From 7d78e24307ff0563a9b848f70fd282f682eabe63 Mon Sep 17 00:00:00 2001 From: Guy David Date: Thu, 3 Jul 2025 03:21:39 +0300 Subject: [PATCH] [PHIElimination] Reuse existing COPY in predecessor basic block The insertion point of COPY isn't always optimal and could eventually lead to a worse block layout, see the regression test. This change affects many architectures but the total number of instructions in the test cases seems to be slightly lower. --- llvm/lib/CodeGen/PHIElimination.cpp | 35 + ...aarch64_be-atomic-store-outline_atomics.ll | 16 +- .../Atomics/aarch64_be-atomic-store-rcpc.ll | 48 +- .../Atomics/aarch64_be-atomic-store-v8a.ll | 48 +- .../AArch64/PHIElimination-debugloc.mir | 2 +- .../AArch64/PHIElimination-reuse-copy.mir | 194 + .../AArch64/aarch64-matrix-umull-smull.ll | 2 +- llvm/test/CodeGen/AArch64/atomicrmw-O0.ll | 48 +- llvm/test/CodeGen/AArch64/bfis-in-loop.ll | 2 +- .../AArch64/complex-deinterleaving-crash.ll | 30 +- ...rleaving-reductions-predicated-scalable.ll | 64 +- ...plex-deinterleaving-reductions-scalable.ll | 22 +- .../complex-deinterleaving-reductions.ll | 12 +- .../AArch64/late-taildup-computed-goto.ll | 38 +- llvm/test/CodeGen/AArch64/phi.ll | 40 +- llvm/test/CodeGen/AArch64/pr48188.ll | 12 +- llvm/test/CodeGen/AArch64/ragreedy-csr.ll | 22 +- .../AArch64/ragreedy-local-interval-cost.ll | 113 +- llvm/test/CodeGen/AArch64/reduce-or-opt.ll | 24 +- llvm/test/CodeGen/AArch64/sink-and-fold.ll | 6 +- llvm/test/CodeGen/AArch64/sve-lsrchain.ll | 14 +- .../CodeGen/AArch64/sve-ptest-removal-sink.ll | 8 +- llvm/test/CodeGen/AArch64/swifterror.ll | 16 +- .../AMDGPU/GlobalISel/atomicrmw_fmax.ll | 146 +- .../AMDGPU/GlobalISel/atomicrmw_fmin.ll | 146 +- .../divergence-temporal-divergent-i1.ll | 14 +- llvm/test/CodeGen/AMDGPU/add_i1.ll | 13 +- .../CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll | 45406 ++++++++-------- .../CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll | 154 +- .../CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll | 336 +- 
.../CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll | 406 +- .../CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll | 57 +- .../CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll | 4218 +- .../CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll | 428 +- .../CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll | 224 +- .../CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll | 740 +- .../CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll | 416 +- .../CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll | 1290 +- .../CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll | 884 +- .../CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll | 2459 +- .../CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll | 78 +- .../atomic_optimizations_global_pointer.ll | 2229 +- .../branch-folding-implicit-def-subreg.ll | 210 +- .../AMDGPU/branch-relaxation-gfx1250.ll | 2 +- .../buffer-fat-pointer-atomicrmw-fadd.ll | 3307 +- .../buffer-fat-pointer-atomicrmw-fmax.ll | 2280 +- .../buffer-fat-pointer-atomicrmw-fmin.ll | 2280 +- llvm/test/CodeGen/AMDGPU/div_i128.ll | 1836 +- llvm/test/CodeGen/AMDGPU/div_v2i128.ll | 228 +- .../divergent-branch-uniform-condition.ll | 2 +- .../CodeGen/AMDGPU/flat-atomicrmw-fmax.ll | 62 +- .../CodeGen/AMDGPU/flat-atomicrmw-fmin.ll | 62 +- .../CodeGen/AMDGPU/global-atomicrmw-fadd.ll | 1870 +- .../CodeGen/AMDGPU/global-atomicrmw-fmax.ll | 1314 +- .../CodeGen/AMDGPU/global-atomicrmw-fmin.ll | 1314 +- .../CodeGen/AMDGPU/global-atomicrmw-fsub.ll | 1584 +- .../AMDGPU/global_atomics_i32_system.ll | 646 +- .../AMDGPU/global_atomics_i64_system.ll | 922 +- .../AMDGPU/global_atomics_scan_fadd.ll | 674 +- .../AMDGPU/global_atomics_scan_fmax.ll | 574 +- .../AMDGPU/global_atomics_scan_fmin.ll | 574 +- .../AMDGPU/global_atomics_scan_fsub.ll | 618 +- .../CodeGen/AMDGPU/indirect-addressing-si.ll | 139 +- .../AMDGPU/move-to-valu-atomicrmw-system.ll | 34 +- llvm/test/CodeGen/AMDGPU/mul.ll | 24 +- llvm/test/CodeGen/AMDGPU/rem_i128.ll | 1740 +- llvm/test/CodeGen/AMDGPU/sdiv64.ll | 236 +- llvm/test/CodeGen/AMDGPU/srem64.ll | 236 +- llvm/test/CodeGen/AMDGPU/sub_i1.ll | 13 +- llvm/test/CodeGen/AMDGPU/udiv64.ll | 212 
+- llvm/test/CodeGen/AMDGPU/urem64.ll | 180 +- .../test/CodeGen/AMDGPU/vni8-across-blocks.ll | 83 +- llvm/test/CodeGen/AMDGPU/wave32.ll | 8 +- llvm/test/CodeGen/ARM/and-cmp0-sink.ll | 22 +- llvm/test/CodeGen/ARM/cttz.ll | 92 +- llvm/test/CodeGen/ARM/select-imm.ll | 16 +- llvm/test/CodeGen/ARM/struct-byval-loop.ll | 16 +- llvm/test/CodeGen/ARM/swifterror.ll | 308 +- llvm/test/CodeGen/AVR/bug-81911.ll | 34 +- .../CodeGen/Hexagon/swp-conv3x3-nested.ll | 3 +- llvm/test/CodeGen/Hexagon/swp-epilog-phi7.ll | 4 +- .../test/CodeGen/Hexagon/swp-matmul-bitext.ll | 2 +- llvm/test/CodeGen/Hexagon/swp-stages4.ll | 7 +- llvm/test/CodeGen/Hexagon/tinycore.ll | 11 +- .../LoongArch/machinelicm-address-pseudos.ll | 56 +- .../CodeGen/PowerPC/2013-07-01-PHIElimBug.mir | 3 +- .../test/CodeGen/PowerPC/check-zero-vector.ll | 42 +- .../CodeGen/PowerPC/disable-ctr-ppcf128.ll | 6 +- llvm/test/CodeGen/PowerPC/phi-eliminate.mir | 9 +- llvm/test/CodeGen/PowerPC/ppcf128-freeze.mir | 30 +- llvm/test/CodeGen/PowerPC/pr116071.ll | 25 +- llvm/test/CodeGen/PowerPC/sms-phi-2.ll | 13 +- llvm/test/CodeGen/PowerPC/sms-phi-3.ll | 24 +- .../PowerPC/stack-restore-with-setjmp.ll | 10 +- llvm/test/CodeGen/PowerPC/subreg-postra-2.ll | 18 +- llvm/test/CodeGen/PowerPC/vsx.ll | 3 +- llvm/test/CodeGen/RISCV/abds.ll | 200 +- llvm/test/CodeGen/RISCV/machine-pipeliner.ll | 24 +- .../RISCV/rvv/fixed-vectors-masked-gather.ll | 120 +- .../fixed-vectors-strided-load-store-asm.ll | 61 +- .../RISCV/rvv/vxrm-insert-out-of-loop.ll | 26 +- llvm/test/CodeGen/RISCV/xcvbi.ll | 60 +- llvm/test/CodeGen/SystemZ/swifterror.ll | 4 +- .../LowOverheadLoops/mve-tail-data-types.ll | 96 +- .../Thumb2/LowOverheadLoops/sibling-loops.ll | 12 +- .../tail-pred-disabled-in-loloops.ll | 44 +- .../varying-outer-2d-reduction.ll | 32 +- .../Thumb2/LowOverheadLoops/while-loops.ll | 111 +- .../test/CodeGen/Thumb2/mve-blockplacement.ll | 21 +- .../CodeGen/Thumb2/mve-float32regloops.ll | 43 +- .../Thumb2/mve-laneinterleaving-reduct.ll | 8 +- 
llvm/test/CodeGen/Thumb2/mve-memtp-loop.ll | 101 +- llvm/test/CodeGen/Thumb2/mve-phireg.ll | 14 +- llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll | 85 +- llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll | 19 +- .../CodeGen/Thumb2/mve-postinc-distribute.ll | 17 +- llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll | 44 +- llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll | 33 +- llvm/test/CodeGen/Thumb2/pr52817.ll | 16 +- llvm/test/CodeGen/VE/Scalar/br_jt.ll | 38 +- .../X86/2012-01-10-UndefExceptionEdge.ll | 4 +- .../CodeGen/X86/AMX/amx-ldtilecfg-insert.ll | 18 +- llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll | 32 +- llvm/test/CodeGen/X86/atomic32.ll | 126 +- llvm/test/CodeGen/X86/atomic64.ll | 35 +- llvm/test/CodeGen/X86/atomic6432.ll | 72 +- .../CodeGen/X86/callbr-asm-branch-folding.ll | 8 +- llvm/test/CodeGen/X86/callbr-asm-kill.mir | 9 +- ...r-breaks-subreg-to-reg-liveness-reduced.ll | 2 +- llvm/test/CodeGen/X86/combine-pmuldq.ll | 8 +- llvm/test/CodeGen/X86/fp128-select.ll | 21 +- llvm/test/CodeGen/X86/madd.ll | 116 +- llvm/test/CodeGen/X86/masked_load.ll | 27 +- .../CodeGen/X86/min-legal-vector-width.ll | 30 +- llvm/test/CodeGen/X86/pcsections-atomics.ll | 296 +- llvm/test/CodeGen/X86/pr15705.ll | 17 +- llvm/test/CodeGen/X86/pr32256.ll | 12 +- llvm/test/CodeGen/X86/pr38795.ll | 15 +- llvm/test/CodeGen/X86/pr49451.ll | 6 +- llvm/test/CodeGen/X86/pr63108.ll | 2 +- llvm/test/CodeGen/X86/sad.ll | 26 +- llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll | 44 +- .../statepoint-cmp-sunk-past-statepoint.ll | 58 +- llvm/test/CodeGen/X86/swifterror.ll | 17 +- .../MIR/InstrRef/phi-regallocd-to-stack.mir | 7 +- .../AArch64/postidx-load.ll | 26 +- .../RISCV/lsr-drop-solution.ll | 18 +- 147 files changed, 43566 insertions(+), 43193 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/PHIElimination-reuse-copy.mir diff --git a/llvm/lib/CodeGen/PHIElimination.cpp b/llvm/lib/CodeGen/PHIElimination.cpp index a93a89ecaa96e..5a6ad417c6fba 100644 --- a/llvm/lib/CodeGen/PHIElimination.cpp +++ 
b/llvm/lib/CodeGen/PHIElimination.cpp @@ -15,6 +15,7 @@ #include "llvm/CodeGen/PHIElimination.h" #include "PHIEliminationUtils.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/LoopInfo.h" @@ -541,6 +542,7 @@ void PHIEliminationImpl::LowerPHINode(MachineBasicBlock &MBB, // Now loop over all of the incoming arguments, changing them to copy into the // IncomingReg register in the corresponding predecessor basic block. SmallPtrSet MBBsInsertedInto; + SmallVector InsertedCopies; for (int i = NumSrcs - 1; i >= 0; --i) { Register SrcReg = MPhi->getOperand(i * 2 + 1).getReg(); unsigned SrcSubReg = MPhi->getOperand(i * 2 + 1).getSubReg(); @@ -607,6 +609,7 @@ void PHIEliminationImpl::LowerPHINode(MachineBasicBlock &MBB, NewSrcInstr = TII->createPHISourceCopy(opBlock, InsertPos, nullptr, SrcReg, SrcSubReg, IncomingReg); } + InsertedCopies.emplace_back(NewSrcInstr); } // We only need to update the LiveVariables kill of SrcReg if this was the @@ -730,6 +733,38 @@ void PHIEliminationImpl::LowerPHINode(MachineBasicBlock &MBB, } } + // Remove redundant COPY instruction chains, which were potentially added by + // the code above. This can prevent future passes from complicating the CFG + // and cause a suboptimal block layout. 
+ for (MachineInstr *NewCopy : InsertedCopies) { + if (NewCopy->isImplicitDef()) + continue; + Register IncomingReg = NewCopy->getOperand(0).getReg(); + if (!IncomingReg.isVirtual()) + continue; + Register SrcReg = NewCopy->getOperand(1).getReg(); + if (!MRI->hasOneNonDBGUse(SrcReg)) + continue; + MachineInstr *DefMI = MRI->getUniqueVRegDef(SrcReg); + if (!DefMI || !DefMI->isCopy() || + DefMI->getParent() != NewCopy->getParent()) + continue; + auto InstrRange = + make_range(std::next(DefMI->getIterator()), NewCopy->getIterator()); + if (any_of(InstrRange, [&](const MachineInstr &MI) { + return MI.readsVirtualRegister(IncomingReg); + })) + continue; + const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg); + const TargetRegisterClass *IncomingRC = MRI->getRegClass(IncomingReg); + if (!IncomingRC->hasSuperClassEq(SrcRC)) + continue; + MRI->replaceRegWith(SrcReg, IncomingReg); + NewCopy->removeFromParent(); + if (LV) + LV->getVarInfo(SrcReg).AliveBlocks.clear(); + } + // Really delete the PHI instruction now, if it is not in the LoweredPHIs map. 
if (EliminateNow) { if (LIS) diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-outline_atomics.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-outline_atomics.ll index c1c5c53aa7df2..6c300b04508b2 100644 --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-outline_atomics.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-outline_atomics.ll @@ -118,8 +118,8 @@ define dso_local void @store_atomic_i64_aligned_seq_cst(i64 %value, ptr %ptr) { define dso_local void @store_atomic_i128_aligned_unordered(i128 %value, ptr %ptr) { ; -O0-LABEL: store_atomic_i128_aligned_unordered: ; -O0: bl __aarch64_cas16_relax -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: store_atomic_i128_aligned_unordered: ; -O1: ldxp xzr, x8, [x2] @@ -131,8 +131,8 @@ define dso_local void @store_atomic_i128_aligned_unordered(i128 %value, ptr %ptr define dso_local void @store_atomic_i128_aligned_monotonic(i128 %value, ptr %ptr) { ; -O0-LABEL: store_atomic_i128_aligned_monotonic: ; -O0: bl __aarch64_cas16_relax -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: store_atomic_i128_aligned_monotonic: ; -O1: ldxp xzr, x8, [x2] @@ -144,8 +144,8 @@ define dso_local void @store_atomic_i128_aligned_monotonic(i128 %value, ptr %ptr define dso_local void @store_atomic_i128_aligned_release(i128 %value, ptr %ptr) { ; -O0-LABEL: store_atomic_i128_aligned_release: ; -O0: bl __aarch64_cas16_rel -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: store_atomic_i128_aligned_release: ; -O1: ldxp xzr, x8, [x2] @@ -157,8 +157,8 @@ define dso_local void @store_atomic_i128_aligned_release(i128 %value, ptr %ptr) define dso_local void @store_atomic_i128_aligned_seq_cst(i128 %value, ptr %ptr) { ; -O0-LABEL: store_atomic_i128_aligned_seq_cst: ; -O0: bl 
__aarch64_cas16_acq_rel -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: store_atomic_i128_aligned_seq_cst: ; -O1: ldaxp xzr, x8, [x2] diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-rcpc.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-rcpc.ll index d1047d84e2956..2a7bbad9d6454 100644 --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-rcpc.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-rcpc.ll @@ -117,13 +117,13 @@ define dso_local void @store_atomic_i64_aligned_seq_cst(i64 %value, ptr %ptr) { define dso_local void @store_atomic_i128_aligned_unordered(i128 %value, ptr %ptr) { ; -O0-LABEL: store_atomic_i128_aligned_unordered: -; -O0: ldxp x10, x12, [x9] +; -O0: ldxp x8, x10, [x13] +; -O0: cmp x8, x9 ; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x10, [x13] +; -O0: subs x10, x10, x11 +; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: store_atomic_i128_aligned_unordered: ; -O1: ldxp xzr, x8, [x2] @@ -134,13 +134,13 @@ define dso_local void @store_atomic_i128_aligned_unordered(i128 %value, ptr %ptr define dso_local void @store_atomic_i128_aligned_monotonic(i128 %value, ptr %ptr) { ; -O0-LABEL: store_atomic_i128_aligned_monotonic: -; -O0: ldxp x10, x12, [x9] +; -O0: ldxp x8, x10, [x13] +; -O0: cmp x8, x9 ; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x10, [x13] +; -O0: subs x10, x10, x11 +; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: store_atomic_i128_aligned_monotonic: ; -O1: ldxp xzr, x8, [x2] @@ -151,13 +151,13 @@ define dso_local void @store_atomic_i128_aligned_monotonic(i128 %value, ptr %ptr define 
dso_local void @store_atomic_i128_aligned_release(i128 %value, ptr %ptr) { ; -O0-LABEL: store_atomic_i128_aligned_release: -; -O0: ldxp x10, x12, [x9] +; -O0: ldxp x8, x10, [x13] +; -O0: cmp x8, x9 ; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x10, [x13] +; -O0: subs x10, x10, x11 +; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: store_atomic_i128_aligned_release: ; -O1: ldxp xzr, x8, [x2] @@ -168,13 +168,13 @@ define dso_local void @store_atomic_i128_aligned_release(i128 %value, ptr %ptr) define dso_local void @store_atomic_i128_aligned_seq_cst(i128 %value, ptr %ptr) { ; -O0-LABEL: store_atomic_i128_aligned_seq_cst: -; -O0: ldaxp x10, x12, [x9] +; -O0: ldaxp x8, x10, [x13] +; -O0: cmp x8, x9 ; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x10, [x13] +; -O0: subs x10, x10, x11 +; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: store_atomic_i128_aligned_seq_cst: ; -O1: ldaxp xzr, x8, [x2] diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-v8a.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-v8a.ll index 1a79c73355143..493bc742f7663 100644 --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-v8a.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-v8a.ll @@ -117,13 +117,13 @@ define dso_local void @store_atomic_i64_aligned_seq_cst(i64 %value, ptr %ptr) { define dso_local void @store_atomic_i128_aligned_unordered(i128 %value, ptr %ptr) { ; -O0-LABEL: store_atomic_i128_aligned_unordered: -; -O0: ldxp x10, x12, [x9] +; -O0: ldxp x8, x10, [x13] +; -O0: cmp x8, x9 ; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, 
x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x10, [x13] +; -O0: subs x10, x10, x11 +; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: store_atomic_i128_aligned_unordered: ; -O1: ldxp xzr, x8, [x2] @@ -134,13 +134,13 @@ define dso_local void @store_atomic_i128_aligned_unordered(i128 %value, ptr %ptr define dso_local void @store_atomic_i128_aligned_monotonic(i128 %value, ptr %ptr) { ; -O0-LABEL: store_atomic_i128_aligned_monotonic: -; -O0: ldxp x10, x12, [x9] +; -O0: ldxp x8, x10, [x13] +; -O0: cmp x8, x9 ; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x10, [x13] +; -O0: subs x10, x10, x11 +; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: store_atomic_i128_aligned_monotonic: ; -O1: ldxp xzr, x8, [x2] @@ -151,13 +151,13 @@ define dso_local void @store_atomic_i128_aligned_monotonic(i128 %value, ptr %ptr define dso_local void @store_atomic_i128_aligned_release(i128 %value, ptr %ptr) { ; -O0-LABEL: store_atomic_i128_aligned_release: -; -O0: ldxp x10, x12, [x9] +; -O0: ldxp x8, x10, [x13] +; -O0: cmp x8, x9 ; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x10, [x13] +; -O0: subs x10, x10, x11 +; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: store_atomic_i128_aligned_release: ; -O1: ldxp xzr, x8, [x2] @@ -168,13 +168,13 @@ define dso_local void @store_atomic_i128_aligned_release(i128 %value, ptr %ptr) define dso_local void @store_atomic_i128_aligned_seq_cst(i128 %value, ptr %ptr) { ; -O0-LABEL: store_atomic_i128_aligned_seq_cst: -; -O0: ldaxp x10, x12, [x9] +; -O0: ldaxp x8, x10, [x13] +; -O0: cmp x8, x9 ; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, 
[x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x10, [x13] +; -O0: subs x10, x10, x11 +; -O0: ccmp x8, x9, #0, eq ; ; -O1-LABEL: store_atomic_i128_aligned_seq_cst: ; -O1: ldaxp xzr, x8, [x2] diff --git a/llvm/test/CodeGen/AArch64/PHIElimination-debugloc.mir b/llvm/test/CodeGen/AArch64/PHIElimination-debugloc.mir index 01c44e3f253bb..993d1c1f1b5f0 100644 --- a/llvm/test/CodeGen/AArch64/PHIElimination-debugloc.mir +++ b/llvm/test/CodeGen/AArch64/PHIElimination-debugloc.mir @@ -37,7 +37,7 @@ body: | bb.1: %x:gpr32 = COPY $wzr ; Test that the debug location is not copied into bb1! - ; CHECK: %3:gpr32 = COPY killed %x{{$}} + ; CHECK: %3:gpr32 = COPY $wzr ; CHECK-LABEL: bb.2: bb.2: %y:gpr32 = PHI %x:gpr32, %bb.1, undef %undef:gpr32, %bb.0, debug-location !14 diff --git a/llvm/test/CodeGen/AArch64/PHIElimination-reuse-copy.mir b/llvm/test/CodeGen/AArch64/PHIElimination-reuse-copy.mir new file mode 100644 index 0000000000000..fb85a0dbff30d --- /dev/null +++ b/llvm/test/CodeGen/AArch64/PHIElimination-reuse-copy.mir @@ -0,0 +1,194 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -run-pass=livevars,phi-node-elimination -verify-machineinstrs -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s + +# Verify that the original COPY in bb.1 is reappropriated as the PHI source in bb.2, +# instead of creating a new COPY with the same source register. 
+ +--- +name: copy_virtual_reg +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: copy_virtual_reg + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: liveins: $nzcv, $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %a:gpr32 = COPY killed $w0 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:gpr32 = IMPLICIT_DEF + ; CHECK-NEXT: Bcc 8, %bb.2, implicit killed $nzcv + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:gpr32 = COPY killed %a + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: %c:gpr32 = COPY killed [[DEF]] + ; CHECK-NEXT: dead %d:gpr32 = COPY killed %c + bb.0: + liveins: $nzcv, $w0 + %a:gpr32 = COPY $w0 + Bcc 8, %bb.2, implicit $nzcv + bb.1: + %b:gpr32 = COPY %a:gpr32 + bb.2: + %c:gpr32 = PHI %b:gpr32, %bb.1, undef %undef:gpr32, %bb.0 + %d:gpr32 = COPY %c:gpr32 +... + +--- +name: copy_physical_reg +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: copy_physical_reg + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: liveins: $nzcv, $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:gpr32 = IMPLICIT_DEF + ; CHECK-NEXT: Bcc 8, %bb.2, implicit killed $nzcv + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: dead $x0 = IMPLICIT_DEF implicit-def $w0 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:gpr32 = COPY killed $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: dead %b:gpr32 = COPY killed [[DEF]] + bb.0: + liveins: $nzcv, $w0 + Bcc 8, %bb.2, implicit $nzcv + bb.1: + $x0 = IMPLICIT_DEF + %a:gpr32 = COPY $w0 + bb.2: + %b:gpr32 = PHI %a:gpr32, %bb.1, undef %undef:gpr32, %bb.0 +... 
+ +--- +name: copy_to_dead +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: copy_to_dead + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: liveins: $wzr, $xzr + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32 = COPY $wzr + ; CHECK-NEXT: dead [[COPY1:%[0-9]+]]:gpr64 = COPY $xzr + ; CHECK-NEXT: TBZW killed [[COPY]], 0, %bb.2 + ; CHECK-NEXT: B %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: dead [[DEF:%[0-9]+]]:gpr64 = IMPLICIT_DEF + ; CHECK-NEXT: dead [[DEF1:%[0-9]+]]:gpr64 = IMPLICIT_DEF + ; CHECK-NEXT: B %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: dead [[DEF2:%[0-9]+]]:gpr64 = IMPLICIT_DEF + ; CHECK-NEXT: dead [[DEF3:%[0-9]+]]:gpr64 = IMPLICIT_DEF + ; CHECK-NEXT: B %bb.1 + bb.0: + liveins: $wzr, $xzr + + %9:gpr32 = COPY $wzr + dead %5:gpr64 = COPY $xzr + TBZW killed %9:gpr32, 0, %bb.2 + B %bb.1 + + bb.1: + successors: %bb.2(0x80000000); %bb.2(100.00%) + + dead %1:gpr64 = PHI undef %3:gpr64, %bb.2, undef %5:gpr64, %bb.0 + dead %2:gpr64 = PHI undef %4:gpr64, %bb.2, undef %5:gpr64, %bb.0 + B %bb.2 + + bb.2: + successors: %bb.1(0x80000000); %bb.1(100.00%) + + dead %3:gpr64 = PHI undef %1:gpr64, %bb.1, undef %5:gpr64, %bb.0 + dead %4:gpr64 = PHI undef %2:gpr64, %bb.1, undef %5:gpr64, %bb.0 + B %bb.1 + +... 
+ +--- +name: update_livevars +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: update_livevars + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $w0, $w1, $nzcv + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32 = COPY killed $w0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY killed $w1 + ; CHECK-NEXT: B %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: liveins: $nzcv + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: dead [[COPY2:%[0-9]+]]:gpr32 = COPY killed [[COPY1]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr32 = COPY [[COPY]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY [[COPY3]] + ; CHECK-NEXT: Bcc 1, %bb.1, implicit $nzcv + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $nzcv + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY killed [[COPY3]] + ; CHECK-NEXT: B %bb.1 + bb.0: + successors: %bb.1 + liveins: $w0, $w1, $nzcv + + %0:gpr32 = COPY killed $w0 + %1:gpr32 = COPY killed $w1 + B %bb.1 + + bb.1: + successors: %bb.2, %bb.1 + liveins: $nzcv + + %2:gpr32 = PHI %3, %bb.2, %1, %bb.0, %3, %bb.1 + %3:gpr32 = COPY %0 + Bcc 1, %bb.1, implicit $nzcv + + bb.2: + successors: %bb.1 + liveins: $nzcv + + B %bb.1 +... + +--- +name: copy_subreg +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: copy_subreg + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY killed $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY killed [[COPY]] + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: dead [[COPY2:%[0-9]+]]:gpr32 = COPY killed [[COPY1]].sub_32 + bb.0: + successors: %bb.1 + liveins: $x0 + + %0:gpr64 = COPY killed $x0 + %1:gpr64 = COPY killed %0 + + bb.1: + %2:gpr32 = PHI %1.sub_32, %bb.0 +... 
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll index 8655bb1292ef7..ca1052a769408 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll @@ -583,8 +583,8 @@ define i16 @red_mla_dup_ext_u8_s8_s16(ptr noalias nocapture noundef readonly %A, ; CHECK-SD-NEXT: mov w10, w2 ; CHECK-SD-NEXT: b.hi .LBB5_4 ; CHECK-SD-NEXT: // %bb.2: -; CHECK-SD-NEXT: mov x11, xzr ; CHECK-SD-NEXT: mov w8, wzr +; CHECK-SD-NEXT: mov x11, xzr ; CHECK-SD-NEXT: b .LBB5_7 ; CHECK-SD-NEXT: .LBB5_3: ; CHECK-SD-NEXT: mov w8, wzr diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll b/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll index 71e0250b36972..9fd27edae3176 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll @@ -199,16 +199,16 @@ define i128 @test_rmw_add_128(ptr %dst) { ; NOLSE-NEXT: sub sp, sp, #48 ; NOLSE-NEXT: .cfi_def_cfa_offset 48 ; NOLSE-NEXT: str x0, [sp, #24] // 8-byte Folded Spill -; NOLSE-NEXT: ldr x8, [x0, #8] -; NOLSE-NEXT: ldr x9, [x0] +; NOLSE-NEXT: ldr x9, [x0, #8] +; NOLSE-NEXT: ldr x8, [x0] ; NOLSE-NEXT: str x9, [sp, #32] // 8-byte Folded Spill ; NOLSE-NEXT: str x8, [sp, #40] // 8-byte Folded Spill ; NOLSE-NEXT: b .LBB4_1 ; NOLSE-NEXT: .LBB4_1: // %atomicrmw.start ; NOLSE-NEXT: // =>This Loop Header: Depth=1 ; NOLSE-NEXT: // Child Loop BB4_2 Depth 2 -; NOLSE-NEXT: ldr x13, [sp, #40] // 8-byte Folded Reload -; NOLSE-NEXT: ldr x11, [sp, #32] // 8-byte Folded Reload +; NOLSE-NEXT: ldr x13, [sp, #32] // 8-byte Folded Reload +; NOLSE-NEXT: ldr x11, [sp, #40] // 8-byte Folded Reload ; NOLSE-NEXT: ldr x9, [sp, #24] // 8-byte Folded Reload ; NOLSE-NEXT: adds x14, x11, #1 ; NOLSE-NEXT: cinc x15, x13, hs @@ -238,8 +238,8 @@ define i128 @test_rmw_add_128(ptr %dst) { ; NOLSE-NEXT: str x9, [sp, #16] // 8-byte Folded Spill ; NOLSE-NEXT: subs x12, x12, x13 ; NOLSE-NEXT: ccmp x10, x11, #0, eq 
-; NOLSE-NEXT: str x9, [sp, #32] // 8-byte Folded Spill -; NOLSE-NEXT: str x8, [sp, #40] // 8-byte Folded Spill +; NOLSE-NEXT: str x9, [sp, #40] // 8-byte Folded Spill +; NOLSE-NEXT: str x8, [sp, #32] // 8-byte Folded Spill ; NOLSE-NEXT: b.ne .LBB4_1 ; NOLSE-NEXT: b .LBB4_6 ; NOLSE-NEXT: .LBB4_6: // %atomicrmw.end @@ -253,15 +253,15 @@ define i128 @test_rmw_add_128(ptr %dst) { ; LSE-NEXT: sub sp, sp, #48 ; LSE-NEXT: .cfi_def_cfa_offset 48 ; LSE-NEXT: str x0, [sp, #24] // 8-byte Folded Spill -; LSE-NEXT: ldr x8, [x0, #8] -; LSE-NEXT: ldr x9, [x0] +; LSE-NEXT: ldr x9, [x0, #8] +; LSE-NEXT: ldr x8, [x0] ; LSE-NEXT: str x9, [sp, #32] // 8-byte Folded Spill ; LSE-NEXT: str x8, [sp, #40] // 8-byte Folded Spill ; LSE-NEXT: b .LBB4_1 ; LSE-NEXT: .LBB4_1: // %atomicrmw.start ; LSE-NEXT: // =>This Inner Loop Header: Depth=1 -; LSE-NEXT: ldr x11, [sp, #40] // 8-byte Folded Reload -; LSE-NEXT: ldr x10, [sp, #32] // 8-byte Folded Reload +; LSE-NEXT: ldr x11, [sp, #32] // 8-byte Folded Reload +; LSE-NEXT: ldr x10, [sp, #40] // 8-byte Folded Reload ; LSE-NEXT: ldr x8, [sp, #24] // 8-byte Folded Reload ; LSE-NEXT: mov x0, x10 ; LSE-NEXT: mov x1, x11 @@ -276,8 +276,8 @@ define i128 @test_rmw_add_128(ptr %dst) { ; LSE-NEXT: str x8, [sp, #16] // 8-byte Folded Spill ; LSE-NEXT: subs x11, x8, x11 ; LSE-NEXT: ccmp x9, x10, #0, eq -; LSE-NEXT: str x9, [sp, #32] // 8-byte Folded Spill -; LSE-NEXT: str x8, [sp, #40] // 8-byte Folded Spill +; LSE-NEXT: str x9, [sp, #40] // 8-byte Folded Spill +; LSE-NEXT: str x8, [sp, #32] // 8-byte Folded Spill ; LSE-NEXT: b.ne .LBB4_1 ; LSE-NEXT: b .LBB4_2 ; LSE-NEXT: .LBB4_2: // %atomicrmw.end @@ -573,16 +573,16 @@ define i128 @test_rmw_nand_128(ptr %dst) { ; NOLSE-NEXT: sub sp, sp, #48 ; NOLSE-NEXT: .cfi_def_cfa_offset 48 ; NOLSE-NEXT: str x0, [sp, #24] // 8-byte Folded Spill -; NOLSE-NEXT: ldr x8, [x0, #8] -; NOLSE-NEXT: ldr x9, [x0] +; NOLSE-NEXT: ldr x9, [x0, #8] +; NOLSE-NEXT: ldr x8, [x0] ; NOLSE-NEXT: str x9, [sp, #32] // 8-byte Folded Spill ; 
NOLSE-NEXT: str x8, [sp, #40] // 8-byte Folded Spill ; NOLSE-NEXT: b .LBB9_1 ; NOLSE-NEXT: .LBB9_1: // %atomicrmw.start ; NOLSE-NEXT: // =>This Loop Header: Depth=1 ; NOLSE-NEXT: // Child Loop BB9_2 Depth 2 -; NOLSE-NEXT: ldr x13, [sp, #40] // 8-byte Folded Reload -; NOLSE-NEXT: ldr x11, [sp, #32] // 8-byte Folded Reload +; NOLSE-NEXT: ldr x13, [sp, #32] // 8-byte Folded Reload +; NOLSE-NEXT: ldr x11, [sp, #40] // 8-byte Folded Reload ; NOLSE-NEXT: ldr x9, [sp, #24] // 8-byte Folded Reload ; NOLSE-NEXT: mov w8, w11 ; NOLSE-NEXT: mvn w10, w8 @@ -616,8 +616,8 @@ define i128 @test_rmw_nand_128(ptr %dst) { ; NOLSE-NEXT: str x9, [sp, #16] // 8-byte Folded Spill ; NOLSE-NEXT: subs x12, x12, x13 ; NOLSE-NEXT: ccmp x10, x11, #0, eq -; NOLSE-NEXT: str x9, [sp, #32] // 8-byte Folded Spill -; NOLSE-NEXT: str x8, [sp, #40] // 8-byte Folded Spill +; NOLSE-NEXT: str x9, [sp, #40] // 8-byte Folded Spill +; NOLSE-NEXT: str x8, [sp, #32] // 8-byte Folded Spill ; NOLSE-NEXT: b.ne .LBB9_1 ; NOLSE-NEXT: b .LBB9_6 ; NOLSE-NEXT: .LBB9_6: // %atomicrmw.end @@ -631,15 +631,15 @@ define i128 @test_rmw_nand_128(ptr %dst) { ; LSE-NEXT: sub sp, sp, #48 ; LSE-NEXT: .cfi_def_cfa_offset 48 ; LSE-NEXT: str x0, [sp, #24] // 8-byte Folded Spill -; LSE-NEXT: ldr x8, [x0, #8] -; LSE-NEXT: ldr x9, [x0] +; LSE-NEXT: ldr x9, [x0, #8] +; LSE-NEXT: ldr x8, [x0] ; LSE-NEXT: str x9, [sp, #32] // 8-byte Folded Spill ; LSE-NEXT: str x8, [sp, #40] // 8-byte Folded Spill ; LSE-NEXT: b .LBB9_1 ; LSE-NEXT: .LBB9_1: // %atomicrmw.start ; LSE-NEXT: // =>This Inner Loop Header: Depth=1 -; LSE-NEXT: ldr x11, [sp, #40] // 8-byte Folded Reload -; LSE-NEXT: ldr x10, [sp, #32] // 8-byte Folded Reload +; LSE-NEXT: ldr x11, [sp, #32] // 8-byte Folded Reload +; LSE-NEXT: ldr x10, [sp, #40] // 8-byte Folded Reload ; LSE-NEXT: ldr x8, [sp, #24] // 8-byte Folded Reload ; LSE-NEXT: mov x0, x10 ; LSE-NEXT: mov x1, x11 @@ -658,8 +658,8 @@ define i128 @test_rmw_nand_128(ptr %dst) { ; LSE-NEXT: str x8, [sp, #16] // 8-byte Folded 
Spill ; LSE-NEXT: subs x11, x8, x11 ; LSE-NEXT: ccmp x9, x10, #0, eq -; LSE-NEXT: str x9, [sp, #32] // 8-byte Folded Spill -; LSE-NEXT: str x8, [sp, #40] // 8-byte Folded Spill +; LSE-NEXT: str x9, [sp, #40] // 8-byte Folded Spill +; LSE-NEXT: str x8, [sp, #32] // 8-byte Folded Spill ; LSE-NEXT: b.ne .LBB9_1 ; LSE-NEXT: b .LBB9_2 ; LSE-NEXT: .LBB9_2: // %atomicrmw.end diff --git a/llvm/test/CodeGen/AArch64/bfis-in-loop.ll b/llvm/test/CodeGen/AArch64/bfis-in-loop.ll index 43d49da1abd21..b0339222bc2df 100644 --- a/llvm/test/CodeGen/AArch64/bfis-in-loop.ll +++ b/llvm/test/CodeGen/AArch64/bfis-in-loop.ll @@ -14,8 +14,8 @@ define i64 @bfis_in_loop_zero() { ; CHECK-LABEL: bfis_in_loop_zero: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: adrp x9, :got:global -; CHECK-NEXT: mov x0, xzr ; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: mov x0, xzr ; CHECK-NEXT: ldr x9, [x9, :got_lo12:global] ; CHECK-NEXT: mov w10, #65536 // =0x10000 ; CHECK-NEXT: ldr x9, [x9] diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-crash.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-crash.ll index 7542e9c4b8f5b..327d0749c7dbf 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-crash.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-crash.ll @@ -35,10 +35,10 @@ define i32 @check_deinterleaving_has_deinterleave(ptr %a) { ; CHECK-LABEL: check_deinterleaving_has_deinterleave: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: movi v2.4s, #1 ; CHECK-NEXT: add x8, x0, #16 ; CHECK-NEXT: movi v3.2d, #0000000000000000 -; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: mov w9, #32 // =0x20 ; CHECK-NEXT: movi v4.2d, #0000000000000000 ; CHECK-NEXT: movi v5.2d, #0000000000000000 @@ -64,16 +64,16 @@ define i32 @check_deinterleaving_has_deinterleave(ptr %a) { ; CHECK-NEXT: ushll v24.4s, v18.4h, #0 ; CHECK-NEXT: ushll2 v18.4s, v18.8h, #0 ; CHECK-NEXT: ushll v20.4s, 
v20.4h, #0 -; CHECK-NEXT: and v21.16b, v21.16b, v1.16b -; CHECK-NEXT: and v19.16b, v19.16b, v1.16b -; CHECK-NEXT: and v22.16b, v22.16b, v1.16b -; CHECK-NEXT: and v17.16b, v17.16b, v1.16b -; CHECK-NEXT: and v23.16b, v23.16b, v1.16b -; CHECK-NEXT: and v24.16b, v24.16b, v1.16b -; CHECK-NEXT: and v18.16b, v18.16b, v1.16b -; CHECK-NEXT: and v20.16b, v20.16b, v1.16b +; CHECK-NEXT: and v21.16b, v21.16b, v2.16b +; CHECK-NEXT: and v19.16b, v19.16b, v2.16b +; CHECK-NEXT: and v22.16b, v22.16b, v2.16b +; CHECK-NEXT: and v17.16b, v17.16b, v2.16b +; CHECK-NEXT: and v23.16b, v23.16b, v2.16b +; CHECK-NEXT: and v24.16b, v24.16b, v2.16b +; CHECK-NEXT: and v18.16b, v18.16b, v2.16b +; CHECK-NEXT: and v20.16b, v20.16b, v2.16b ; CHECK-NEXT: add v4.4s, v4.4s, v19.4s -; CHECK-NEXT: add v2.4s, v2.4s, v21.4s +; CHECK-NEXT: add v1.4s, v1.4s, v21.4s ; CHECK-NEXT: add v0.4s, v0.4s, v22.4s ; CHECK-NEXT: add v3.4s, v3.4s, v17.4s ; CHECK-NEXT: add v16.4s, v16.4s, v23.4s @@ -82,12 +82,12 @@ define i32 @check_deinterleaving_has_deinterleave(ptr %a) { ; CHECK-NEXT: add v7.4s, v7.4s, v18.4s ; CHECK-NEXT: b.ne .LBB1_1 ; CHECK-NEXT: // %bb.2: // %middle.block -; CHECK-NEXT: add v1.4s, v7.4s, v3.4s +; CHECK-NEXT: add v2.4s, v7.4s, v3.4s ; CHECK-NEXT: add v3.4s, v16.4s, v4.4s ; CHECK-NEXT: add v0.4s, v5.4s, v0.4s -; CHECK-NEXT: add v2.4s, v6.4s, v2.4s -; CHECK-NEXT: add v1.4s, v1.4s, v3.4s -; CHECK-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-NEXT: add v1.4s, v6.4s, v1.4s +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: add v1.4s, v2.4s, v3.4s ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll index d67aa08125f74..42b7557c6ecb5 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll +++ 
b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll @@ -19,33 +19,33 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) { ; CHECK-NEXT: mov w8, #100 // =0x64 ; CHECK-NEXT: whilelo p1.d, xzr, x8 ; CHECK-NEXT: cntd x9 -; CHECK-NEXT: rdvl x10, #2 +; CHECK-NEXT: rdvl x11, #2 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov x11, x9 +; CHECK-NEXT: mov x10, x9 ; CHECK-NEXT: .LBB0_1: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: zip2 p2.d, p1.d, p1.d -; CHECK-NEXT: mov z6.d, z0.d -; CHECK-NEXT: mov z7.d, z1.d +; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: mov z7.d, z0.d ; CHECK-NEXT: zip1 p1.d, p1.d, p1.d ; CHECK-NEXT: ld1d { z2.d }, p2/z, [x0, #1, mul vl] ; CHECK-NEXT: ld1d { z4.d }, p2/z, [x1, #1, mul vl] ; CHECK-NEXT: ld1d { z3.d }, p1/z, [x0] ; CHECK-NEXT: ld1d { z5.d }, p1/z, [x1] -; CHECK-NEXT: add x1, x1, x10 -; CHECK-NEXT: add x0, x0, x10 +; CHECK-NEXT: add x1, x1, x11 +; CHECK-NEXT: add x0, x0, x11 ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #0 ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0 ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #90 ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #90 -; CHECK-NEXT: mov z1.d, p2/m, z7.d -; CHECK-NEXT: mov z0.d, p1/m, z6.d -; CHECK-NEXT: whilelo p1.d, x11, x8 -; CHECK-NEXT: add x11, x11, x9 +; CHECK-NEXT: mov z0.d, p2/m, z7.d +; CHECK-NEXT: mov z1.d, p1/m, z6.d +; CHECK-NEXT: whilelo p1.d, x10, x8 +; CHECK-NEXT: add x10, x10, x9 ; CHECK-NEXT: b.mi .LBB0_1 ; CHECK-NEXT: // %bb.2: // %exit.block -; CHECK-NEXT: uzp1 z2.d, z0.d, z1.d -; CHECK-NEXT: uzp2 z1.d, z0.d, z1.d +; CHECK-NEXT: uzp1 z2.d, z1.d, z0.d +; CHECK-NEXT: uzp2 z1.d, z1.d, z0.d ; CHECK-NEXT: faddv d0, p0, z2.d ; CHECK-NEXT: faddv d1, p0, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -122,8 +122,8 @@ define %"class.std::complex" @complex_mul_predicated_v2f64(ptr %a, ptr %b, ptr % ; CHECK-NEXT: .LBB1_1: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: 
ld1w { z2.d }, p0/z, [x2, x8, lsl #2] -; CHECK-NEXT: mov z6.d, z0.d -; CHECK-NEXT: mov z7.d, z1.d +; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: mov z7.d, z0.d ; CHECK-NEXT: add x8, x8, x9 ; CHECK-NEXT: cmpne p1.d, p0/z, z2.d, #0 ; CHECK-NEXT: cmp x10, x8 @@ -139,12 +139,12 @@ define %"class.std::complex" @complex_mul_predicated_v2f64(ptr %a, ptr %b, ptr % ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0 ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #90 ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #90 -; CHECK-NEXT: mov z1.d, p2/m, z7.d -; CHECK-NEXT: mov z0.d, p1/m, z6.d +; CHECK-NEXT: mov z0.d, p2/m, z7.d +; CHECK-NEXT: mov z1.d, p1/m, z6.d ; CHECK-NEXT: b.ne .LBB1_1 ; CHECK-NEXT: // %bb.2: // %exit.block -; CHECK-NEXT: uzp1 z2.d, z0.d, z1.d -; CHECK-NEXT: uzp2 z1.d, z0.d, z1.d +; CHECK-NEXT: uzp1 z2.d, z1.d, z0.d +; CHECK-NEXT: uzp2 z1.d, z1.d, z0.d ; CHECK-NEXT: faddv d0, p0, z2.d ; CHECK-NEXT: faddv d1, p0, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -216,16 +216,16 @@ define %"class.std::complex" @complex_mul_predicated_x2_v2f64(ptr %a, ptr %b, pt ; CHECK-NEXT: mov w8, #100 // =0x64 ; CHECK-NEXT: whilelo p1.d, xzr, x8 ; CHECK-NEXT: cntd x9 -; CHECK-NEXT: rdvl x10, #2 +; CHECK-NEXT: rdvl x11, #2 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: cnth x11 -; CHECK-NEXT: mov x12, x9 +; CHECK-NEXT: mov x10, x9 +; CHECK-NEXT: cnth x12 ; CHECK-NEXT: .LBB2_1: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ld1w { z2.d }, p1/z, [x2] -; CHECK-NEXT: mov z6.d, z0.d -; CHECK-NEXT: mov z7.d, z1.d -; CHECK-NEXT: add x2, x2, x11 +; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: mov z7.d, z0.d +; CHECK-NEXT: add x2, x2, x12 ; CHECK-NEXT: and z2.d, z2.d, #0xffffffff ; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0 ; CHECK-NEXT: zip2 p2.d, p1.d, p1.d @@ -234,20 +234,20 @@ define %"class.std::complex" @complex_mul_predicated_x2_v2f64(ptr %a, ptr %b, pt ; CHECK-NEXT: ld1d { z4.d }, p2/z, [x1, #1, mul vl] ; CHECK-NEXT: ld1d { z3.d }, p1/z, [x0] ; CHECK-NEXT: 
ld1d { z5.d }, p1/z, [x1] -; CHECK-NEXT: add x1, x1, x10 -; CHECK-NEXT: add x0, x0, x10 +; CHECK-NEXT: add x1, x1, x11 +; CHECK-NEXT: add x0, x0, x11 ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #0 ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0 ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #90 ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #90 -; CHECK-NEXT: mov z1.d, p2/m, z7.d -; CHECK-NEXT: mov z0.d, p1/m, z6.d -; CHECK-NEXT: whilelo p1.d, x12, x8 -; CHECK-NEXT: add x12, x12, x9 +; CHECK-NEXT: mov z0.d, p2/m, z7.d +; CHECK-NEXT: mov z1.d, p1/m, z6.d +; CHECK-NEXT: whilelo p1.d, x10, x8 +; CHECK-NEXT: add x10, x10, x9 ; CHECK-NEXT: b.mi .LBB2_1 ; CHECK-NEXT: // %bb.2: // %exit.block -; CHECK-NEXT: uzp1 z2.d, z0.d, z1.d -; CHECK-NEXT: uzp2 z1.d, z0.d, z1.d +; CHECK-NEXT: uzp1 z2.d, z1.d, z0.d +; CHECK-NEXT: uzp2 z1.d, z1.d, z0.d ; CHECK-NEXT: faddv d0, p0, z2.d ; CHECK-NEXT: faddv d1, p0, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll index 0646ca4948e1d..663079401c75b 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll @@ -31,14 +31,14 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) { ; CHECK-NEXT: ldr z5, [x1] ; CHECK-NEXT: add x1, x1, x10 ; CHECK-NEXT: add x0, x0, x10 -; CHECK-NEXT: fcmla z0.d, p0/m, z5.d, z3.d, #0 -; CHECK-NEXT: fcmla z1.d, p0/m, z4.d, z2.d, #0 -; CHECK-NEXT: fcmla z0.d, p0/m, z5.d, z3.d, #90 -; CHECK-NEXT: fcmla z1.d, p0/m, z4.d, z2.d, #90 +; CHECK-NEXT: fcmla z1.d, p0/m, z5.d, z3.d, #0 +; CHECK-NEXT: fcmla z0.d, p0/m, z4.d, z2.d, #0 +; CHECK-NEXT: fcmla z1.d, p0/m, z5.d, z3.d, #90 +; CHECK-NEXT: fcmla z0.d, p0/m, z4.d, z2.d, #90 ; CHECK-NEXT: b.ne .LBB0_1 ; CHECK-NEXT: // %bb.2: // %exit.block -; CHECK-NEXT: uzp1 z2.d, z0.d, z1.d -; CHECK-NEXT: uzp2 
z1.d, z0.d, z1.d +; CHECK-NEXT: uzp1 z2.d, z1.d, z0.d +; CHECK-NEXT: uzp2 z1.d, z1.d, z0.d ; CHECK-NEXT: faddv d0, p0, z2.d ; CHECK-NEXT: faddv d1, p0, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -307,8 +307,8 @@ exit.block: ; preds = %vector.body define dso_local %"class.std::complex" @reduction_mix(ptr %a, ptr %b, ptr noalias nocapture noundef readnone %c, [2 x double] %d.coerce, ptr nocapture noundef readonly %s, ptr nocapture noundef writeonly %outs) local_unnamed_addr #0 { ; CHECK-LABEL: reduction_mix: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: movi v2.2d, #0000000000000000 ; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: movi v2.2d, #0000000000000000 ; CHECK-NEXT: cntd x9 ; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: neg x10, x9 @@ -325,13 +325,13 @@ define dso_local %"class.std::complex" @reduction_mix(ptr %a, ptr %b, ptr noalia ; CHECK-NEXT: ld1w { z5.d }, p0/z, [x3, x8, lsl #2] ; CHECK-NEXT: add x8, x8, x9 ; CHECK-NEXT: cmp x10, x8 -; CHECK-NEXT: fadd z1.d, z4.d, z1.d -; CHECK-NEXT: fadd z0.d, z3.d, z0.d +; CHECK-NEXT: fadd z0.d, z4.d, z0.d +; CHECK-NEXT: fadd z1.d, z3.d, z1.d ; CHECK-NEXT: add z2.d, z5.d, z2.d ; CHECK-NEXT: b.ne .LBB3_1 ; CHECK-NEXT: // %bb.2: // %middle.block -; CHECK-NEXT: uzp2 z3.d, z0.d, z1.d -; CHECK-NEXT: uzp1 z1.d, z0.d, z1.d +; CHECK-NEXT: uzp2 z3.d, z1.d, z0.d +; CHECK-NEXT: uzp1 z1.d, z1.d, z0.d ; CHECK-NEXT: uaddv d2, p0, z2.d ; CHECK-NEXT: faddv d0, p0, z3.d ; CHECK-NEXT: faddv d1, p0, z1.d diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll index aed3072bb4af3..c977869d2ce95 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll @@ -25,14 +25,14 @@ define dso_local %"struct.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) { ; CHECK-NEXT: ldp q3, q2, [x9] ; CHECK-NEXT: cmp x8, #1600 ; CHECK-NEXT: ldp q5, q4, 
[x10] -; CHECK-NEXT: fcmla v0.2d, v5.2d, v3.2d, #0 -; CHECK-NEXT: fcmla v1.2d, v4.2d, v2.2d, #0 -; CHECK-NEXT: fcmla v0.2d, v5.2d, v3.2d, #90 -; CHECK-NEXT: fcmla v1.2d, v4.2d, v2.2d, #90 +; CHECK-NEXT: fcmla v1.2d, v5.2d, v3.2d, #0 +; CHECK-NEXT: fcmla v0.2d, v4.2d, v2.2d, #0 +; CHECK-NEXT: fcmla v1.2d, v5.2d, v3.2d, #90 +; CHECK-NEXT: fcmla v0.2d, v4.2d, v2.2d, #90 ; CHECK-NEXT: b.ne .LBB0_1 ; CHECK-NEXT: // %bb.2: // %middle.block -; CHECK-NEXT: zip2 v2.2d, v0.2d, v1.2d -; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d +; CHECK-NEXT: zip2 v2.2d, v1.2d, v0.2d +; CHECK-NEXT: zip1 v0.2d, v1.2d, v0.2d ; CHECK-NEXT: faddp d0, v0.2d ; CHECK-NEXT: faddp d1, v2.2d ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/late-taildup-computed-goto.ll b/llvm/test/CodeGen/AArch64/late-taildup-computed-goto.ll index 381904f776604..92d5c595715d8 100644 --- a/llvm/test/CodeGen/AArch64/late-taildup-computed-goto.ll +++ b/llvm/test/CodeGen/AArch64/late-taildup-computed-goto.ll @@ -25,21 +25,21 @@ define void @test_interp(ptr %frame, ptr %dst) { ; CHECK-NEXT: adrp x21, _opcode.targets@PAGE ; CHECK-NEXT: Lloh1: ; CHECK-NEXT: add x21, x21, _opcode.targets@PAGEOFF -; CHECK-NEXT: mov x24, xzr +; CHECK-NEXT: mov x22, xzr ; CHECK-NEXT: add x8, x21, xzr, lsl #3 ; CHECK-NEXT: mov x19, x1 ; CHECK-NEXT: mov x20, x0 -; CHECK-NEXT: mov x23, xzr -; CHECK-NEXT: mov w22, #1 ; =0x1 -; CHECK-NEXT: add x24, x24, #1 +; CHECK-NEXT: mov x24, xzr +; CHECK-NEXT: mov w23, #1 ; =0x1 +; CHECK-NEXT: add x22, x22, #1 ; CHECK-NEXT: br x8 ; CHECK-NEXT: Ltmp0: ; Block address taken ; CHECK-NEXT: LBB0_1: ; %loop.header ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: add x8, x21, x24, lsl #3 +; CHECK-NEXT: add x8, x21, x22, lsl #3 ; CHECK-NEXT: mov x20, xzr -; CHECK-NEXT: mov x23, xzr -; CHECK-NEXT: add x24, x24, #1 +; CHECK-NEXT: mov x24, xzr +; CHECK-NEXT: add x22, x22, #1 ; CHECK-NEXT: br x8 ; CHECK-NEXT: Ltmp1: ; Block address taken ; CHECK-NEXT: LBB0_2: ; %op1.bb @@ -49,34 +49,34 @@ define void 
@test_interp(ptr %frame, ptr %dst) { ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr x0, [x20, #-8]! ; CHECK-NEXT: ldr x8, [x0, #8] -; CHECK-NEXT: str x22, [x0] +; CHECK-NEXT: str x23, [x0] ; CHECK-NEXT: ldr x8, [x8, #48] ; CHECK-NEXT: blr x8 -; CHECK-NEXT: add x8, x21, x24, lsl #3 -; CHECK-NEXT: add x24, x24, #1 +; CHECK-NEXT: add x8, x21, x22, lsl #3 +; CHECK-NEXT: add x22, x22, #1 ; CHECK-NEXT: br x8 ; CHECK-NEXT: Ltmp3: ; Block address taken ; CHECK-NEXT: LBB0_4: ; %op2.bb ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: add x8, x21, x24, lsl #3 +; CHECK-NEXT: add x8, x21, x22, lsl #3 ; CHECK-NEXT: mov x20, xzr -; CHECK-NEXT: str x23, [x19] -; CHECK-NEXT: mov x23, xzr -; CHECK-NEXT: add x24, x24, #1 +; CHECK-NEXT: str x24, [x19] +; CHECK-NEXT: mov x24, xzr +; CHECK-NEXT: add x22, x22, #1 ; CHECK-NEXT: br x8 ; CHECK-NEXT: Ltmp4: ; Block address taken ; CHECK-NEXT: LBB0_5: ; %op4.bb ; CHECK-NEXT: Ltmp5: ; Block address taken ; CHECK-NEXT: LBB0_6: ; %op5.bb ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: str x23, [x19] -; CHECK-NEXT: ldur x8, [x23, #12] +; CHECK-NEXT: str x24, [x19] +; CHECK-NEXT: ldur x8, [x24, #12] ; CHECK-NEXT: ldur x9, [x20, #-8] -; CHECK-NEXT: add x23, x23, #20 +; CHECK-NEXT: add x24, x24, #20 ; CHECK-NEXT: stp x8, x9, [x20, #-8] -; CHECK-NEXT: add x8, x21, x24, lsl #3 +; CHECK-NEXT: add x8, x21, x22, lsl #3 ; CHECK-NEXT: add x20, x20, #8 -; CHECK-NEXT: add x24, x24, #1 +; CHECK-NEXT: add x22, x22, #1 ; CHECK-NEXT: br x8 ; CHECK-NEXT: .loh AdrpAdd Lloh0, Lloh1 entry: diff --git a/llvm/test/CodeGen/AArch64/phi.ll b/llvm/test/CodeGen/AArch64/phi.ll index 02842c04bf7bf..446c3beeff976 100644 --- a/llvm/test/CodeGen/AArch64/phi.ll +++ b/llvm/test/CodeGen/AArch64/phi.ll @@ -131,8 +131,8 @@ define i128 @ti128(i1 %c, ptr %p, i128 %a, i128 %b) { ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: tbz w0, #0, .LBB4_2 ; CHECK-SD-NEXT: // %bb.1: // %t -; CHECK-SD-NEXT: mov x4, x2 ; CHECK-SD-NEXT: mov x5, 
x3 +; CHECK-SD-NEXT: mov x4, x2 ; CHECK-SD-NEXT: str wzr, [x1] ; CHECK-SD-NEXT: .LBB4_2: // %e ; CHECK-SD-NEXT: mov x0, x4 @@ -336,9 +336,9 @@ define <3 x i8> @tv3i8(i1 %c, ptr %p, <3 x i8> %a, <3 x i8> %b) { ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: tbz w0, #0, .LBB11_2 ; CHECK-SD-NEXT: // %bb.1: // %t -; CHECK-SD-NEXT: mov w5, w2 -; CHECK-SD-NEXT: mov w6, w3 ; CHECK-SD-NEXT: mov w7, w4 +; CHECK-SD-NEXT: mov w6, w3 +; CHECK-SD-NEXT: mov w5, w2 ; CHECK-SD-NEXT: str wzr, [x1] ; CHECK-SD-NEXT: .LBB11_2: // %e ; CHECK-SD-NEXT: mov w0, w5 @@ -454,8 +454,8 @@ define <32 x i8> @tv32i8(i1 %c, ptr %p, <32 x i8> %a, <32 x i8> %b) { ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: tbz w0, #0, .LBB15_2 ; CHECK-SD-NEXT: // %bb.1: // %t -; CHECK-SD-NEXT: mov v2.16b, v0.16b ; CHECK-SD-NEXT: mov v3.16b, v1.16b +; CHECK-SD-NEXT: mov v2.16b, v0.16b ; CHECK-SD-NEXT: str wzr, [x1] ; CHECK-SD-NEXT: .LBB15_2: // %e ; CHECK-SD-NEXT: mov v0.16b, v2.16b @@ -584,8 +584,8 @@ define <16 x i16> @tv16i16(i1 %c, ptr %p, <16 x i16> %a, <16 x i16> %b) { ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: tbz w0, #0, .LBB20_2 ; CHECK-SD-NEXT: // %bb.1: // %t -; CHECK-SD-NEXT: mov v2.16b, v0.16b ; CHECK-SD-NEXT: mov v3.16b, v1.16b +; CHECK-SD-NEXT: mov v2.16b, v0.16b ; CHECK-SD-NEXT: str wzr, [x1] ; CHECK-SD-NEXT: .LBB20_2: // %e ; CHECK-SD-NEXT: mov v0.16b, v2.16b @@ -679,8 +679,8 @@ define <8 x i32> @tv8i32(i1 %c, ptr %p, <8 x i32> %a, <8 x i32> %b) { ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: tbz w0, #0, .LBB24_2 ; CHECK-SD-NEXT: // %bb.1: // %t -; CHECK-SD-NEXT: mov v2.16b, v0.16b ; CHECK-SD-NEXT: mov v3.16b, v1.16b +; CHECK-SD-NEXT: mov v2.16b, v0.16b ; CHECK-SD-NEXT: str wzr, [x1] ; CHECK-SD-NEXT: .LBB24_2: // %e ; CHECK-SD-NEXT: mov v0.16b, v2.16b @@ -734,10 +734,10 @@ define <3 x i64> @tv3i64(i1 %c, ptr %p, <3 x i64> %a, <3 x i64> %b) { ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: tbz w0, #0, .LBB26_2 ; CHECK-SD-NEXT: // %bb.1: // %t -; CHECK-SD-NEXT: fmov d3, d0 +; 
CHECK-SD-NEXT: fmov d5, d2 ; CHECK-SD-NEXT: fmov d4, d1 ; CHECK-SD-NEXT: str wzr, [x1] -; CHECK-SD-NEXT: fmov d5, d2 +; CHECK-SD-NEXT: fmov d3, d0 ; CHECK-SD-NEXT: .LBB26_2: // %e ; CHECK-SD-NEXT: fmov d0, d3 ; CHECK-SD-NEXT: fmov d1, d4 @@ -783,8 +783,8 @@ define <4 x i64> @tv4i64(i1 %c, ptr %p, <4 x i64> %a, <4 x i64> %b) { ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: tbz w0, #0, .LBB27_2 ; CHECK-SD-NEXT: // %bb.1: // %t -; CHECK-SD-NEXT: mov v2.16b, v0.16b ; CHECK-SD-NEXT: mov v3.16b, v1.16b +; CHECK-SD-NEXT: mov v2.16b, v0.16b ; CHECK-SD-NEXT: str wzr, [x1] ; CHECK-SD-NEXT: .LBB27_2: // %e ; CHECK-SD-NEXT: mov v0.16b, v2.16b @@ -818,8 +818,8 @@ define <2 x i128> @tv2i128(i1 %c, ptr %p, <2 x i128> %a, <2 x i128> %b) { ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: tbz w0, #0, .LBB28_2 ; CHECK-SD-NEXT: // %bb.1: // %t -; CHECK-SD-NEXT: mov x6, x2 ; CHECK-SD-NEXT: mov x7, x3 +; CHECK-SD-NEXT: mov x6, x2 ; CHECK-SD-NEXT: str wzr, [x1] ; CHECK-SD-NEXT: b .LBB28_3 ; CHECK-SD-NEXT: .LBB28_2: @@ -883,10 +883,10 @@ define <3 x ptr> @tv3p0(i1 %c, ptr %p, <3 x ptr> %a, <3 x ptr> %b) { ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: tbz w0, #0, .LBB30_2 ; CHECK-SD-NEXT: // %bb.1: // %t -; CHECK-SD-NEXT: fmov d3, d0 +; CHECK-SD-NEXT: fmov d5, d2 ; CHECK-SD-NEXT: fmov d4, d1 ; CHECK-SD-NEXT: str wzr, [x1] -; CHECK-SD-NEXT: fmov d5, d2 +; CHECK-SD-NEXT: fmov d3, d0 ; CHECK-SD-NEXT: .LBB30_2: // %e ; CHECK-SD-NEXT: fmov d0, d3 ; CHECK-SD-NEXT: fmov d1, d4 @@ -932,8 +932,8 @@ define <4 x ptr> @tv4p0(i1 %c, ptr %p, <4 x ptr> %a, <4 x ptr> %b) { ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: tbz w0, #0, .LBB31_2 ; CHECK-SD-NEXT: // %bb.1: // %t -; CHECK-SD-NEXT: mov v2.16b, v0.16b ; CHECK-SD-NEXT: mov v3.16b, v1.16b +; CHECK-SD-NEXT: mov v2.16b, v0.16b ; CHECK-SD-NEXT: str wzr, [x1] ; CHECK-SD-NEXT: .LBB31_2: // %e ; CHECK-SD-NEXT: mov v0.16b, v2.16b @@ -1047,8 +1047,8 @@ define <16 x half> @tv16f16(i1 %c, ptr %p, <16 x half> %a, <16 x half> %b) { ; CHECK-SD: // %bb.0: 
// %entry ; CHECK-SD-NEXT: tbz w0, #0, .LBB36_2 ; CHECK-SD-NEXT: // %bb.1: // %t -; CHECK-SD-NEXT: mov v2.16b, v0.16b ; CHECK-SD-NEXT: mov v3.16b, v1.16b +; CHECK-SD-NEXT: mov v2.16b, v0.16b ; CHECK-SD-NEXT: str wzr, [x1] ; CHECK-SD-NEXT: .LBB36_2: // %e ; CHECK-SD-NEXT: mov v0.16b, v2.16b @@ -1142,8 +1142,8 @@ define <8 x float> @tv8f32(i1 %c, ptr %p, <8 x float> %a, <8 x float> %b) { ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: tbz w0, #0, .LBB40_2 ; CHECK-SD-NEXT: // %bb.1: // %t -; CHECK-SD-NEXT: mov v2.16b, v0.16b ; CHECK-SD-NEXT: mov v3.16b, v1.16b +; CHECK-SD-NEXT: mov v2.16b, v0.16b ; CHECK-SD-NEXT: str wzr, [x1] ; CHECK-SD-NEXT: .LBB40_2: // %e ; CHECK-SD-NEXT: mov v0.16b, v2.16b @@ -1197,10 +1197,10 @@ define <3 x double> @tv3f64(i1 %c, ptr %p, <3 x double> %a, <3 x double> %b) { ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: tbz w0, #0, .LBB42_2 ; CHECK-SD-NEXT: // %bb.1: // %t -; CHECK-SD-NEXT: fmov d3, d0 +; CHECK-SD-NEXT: fmov d5, d2 ; CHECK-SD-NEXT: fmov d4, d1 ; CHECK-SD-NEXT: str wzr, [x1] -; CHECK-SD-NEXT: fmov d5, d2 +; CHECK-SD-NEXT: fmov d3, d0 ; CHECK-SD-NEXT: .LBB42_2: // %e ; CHECK-SD-NEXT: fmov d0, d3 ; CHECK-SD-NEXT: fmov d1, d4 @@ -1246,8 +1246,8 @@ define <4 x double> @tv4f64(i1 %c, ptr %p, <4 x double> %a, <4 x double> %b) { ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: tbz w0, #0, .LBB43_2 ; CHECK-SD-NEXT: // %bb.1: // %t -; CHECK-SD-NEXT: mov v2.16b, v0.16b ; CHECK-SD-NEXT: mov v3.16b, v1.16b +; CHECK-SD-NEXT: mov v2.16b, v0.16b ; CHECK-SD-NEXT: str wzr, [x1] ; CHECK-SD-NEXT: .LBB43_2: // %e ; CHECK-SD-NEXT: mov v0.16b, v2.16b @@ -1281,8 +1281,8 @@ define <2 x fp128> @tv2f128(i1 %c, ptr %p, <2 x fp128> %a, <2 x fp128> %b) { ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: tbz w0, #0, .LBB44_2 ; CHECK-SD-NEXT: // %bb.1: // %t -; CHECK-SD-NEXT: mov v2.16b, v0.16b ; CHECK-SD-NEXT: mov v3.16b, v1.16b +; CHECK-SD-NEXT: mov v2.16b, v0.16b ; CHECK-SD-NEXT: str wzr, [x1] ; CHECK-SD-NEXT: .LBB44_2: // %e ; CHECK-SD-NEXT: mov v0.16b, 
v2.16b @@ -1296,8 +1296,8 @@ define <2 x fp128> @tv2f128(i1 %c, ptr %p, <2 x fp128> %a, <2 x fp128> %b) { ; CHECK-GI-NEXT: mov d4, v1.d[1] ; CHECK-GI-NEXT: mov d5, v0.d[1] ; CHECK-GI-NEXT: str wzr, [x1] -; CHECK-GI-NEXT: fmov d2, d0 ; CHECK-GI-NEXT: fmov d3, d1 +; CHECK-GI-NEXT: fmov d2, d0 ; CHECK-GI-NEXT: b .LBB44_3 ; CHECK-GI-NEXT: .LBB44_2: ; CHECK-GI-NEXT: mov d4, v3.d[1] diff --git a/llvm/test/CodeGen/AArch64/pr48188.ll b/llvm/test/CodeGen/AArch64/pr48188.ll index d01069696572e..634517b099f98 100644 --- a/llvm/test/CodeGen/AArch64/pr48188.ll +++ b/llvm/test/CodeGen/AArch64/pr48188.ll @@ -14,17 +14,17 @@ define void @test() nounwind { ; GISEL-NEXT: b .LBB0_1 ; GISEL-NEXT: .LBB0_1: // %loop ; GISEL-NEXT: // =>This Inner Loop Header: Depth=1 -; GISEL-NEXT: ldr x8, [sp, #8] // 8-byte Folded Reload -; GISEL-NEXT: ldr x9, [sp] // 8-byte Folded Reload -; GISEL-NEXT: str x9, [sp] // 8-byte Folded Spill -; GISEL-NEXT: str x8, [sp, #8] // 8-byte Folded Spill +; GISEL-NEXT: ldr x8, [sp] // 8-byte Folded Reload +; GISEL-NEXT: ldr x9, [sp, #8] // 8-byte Folded Reload +; GISEL-NEXT: str x9, [sp, #8] // 8-byte Folded Spill +; GISEL-NEXT: str x8, [sp] // 8-byte Folded Spill ; GISEL-NEXT: b .LBB0_1 ; ; SDAG-LABEL: test: ; SDAG: // %bb.0: // %entry ; SDAG-NEXT: sub sp, sp, #16 -; SDAG-NEXT: mov x1, xzr -; SDAG-NEXT: mov x0, x1 +; SDAG-NEXT: mov x0, xzr +; SDAG-NEXT: mov x1, x0 ; SDAG-NEXT: str x1, [sp] // 8-byte Folded Spill ; SDAG-NEXT: str x0, [sp, #8] // 8-byte Folded Spill ; SDAG-NEXT: b .LBB0_1 diff --git a/llvm/test/CodeGen/AArch64/ragreedy-csr.ll b/llvm/test/CodeGen/AArch64/ragreedy-csr.ll index 5b501762418ef..921cadc7a7511 100644 --- a/llvm/test/CodeGen/AArch64/ragreedy-csr.ll +++ b/llvm/test/CodeGen/AArch64/ragreedy-csr.ll @@ -211,27 +211,27 @@ define fastcc i32 @prune_match(ptr nocapture readonly %a, ptr nocapture readonly ; CHECK-NEXT: cmp w12, #2 ; CHECK-NEXT: b.ne LBB0_43 ; CHECK-NEXT: ; %bb.35: ; %while.cond130.preheader -; CHECK-NEXT: ldrb w8, [x9, x11] -; 
CHECK-NEXT: cbz w8, LBB0_23 +; CHECK-NEXT: ldrb w12, [x9, x11] +; CHECK-NEXT: cbz w12, LBB0_23 ; CHECK-NEXT: ; %bb.36: ; %land.rhs134.preheader -; CHECK-NEXT: mov x12, xzr +; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: mov w0, #1 ; =0x1 ; CHECK-NEXT: b LBB0_38 ; CHECK-NEXT: LBB0_37: ; %if.then152 ; CHECK-NEXT: ; in Loop: Header=BB0_38 Depth=1 -; CHECK-NEXT: add x8, x9, x12 -; CHECK-NEXT: add x12, x12, #1 -; CHECK-NEXT: add x8, x8, x11 -; CHECK-NEXT: ldrb w8, [x8, #1] -; CHECK-NEXT: cbz w8, LBB0_43 +; CHECK-NEXT: add x12, x9, x8 +; CHECK-NEXT: add x8, x8, #1 +; CHECK-NEXT: add x12, x12, x11 +; CHECK-NEXT: ldrb w12, [x12, #1] +; CHECK-NEXT: cbz w12, LBB0_43 ; CHECK-NEXT: LBB0_38: ; %land.rhs134 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: add x13, x10, x12 +; CHECK-NEXT: add x13, x10, x8 ; CHECK-NEXT: ldrb w13, [x13, x11] ; CHECK-NEXT: cbz w13, LBB0_23 ; CHECK-NEXT: ; %bb.39: ; %while.body139 ; CHECK-NEXT: ; in Loop: Header=BB0_38 Depth=1 -; CHECK-NEXT: cmp w8, w13 +; CHECK-NEXT: cmp w12, w13 ; CHECK-NEXT: b.eq LBB0_37 ; CHECK-NEXT: ; %bb.40: ; %while.body139 ; CHECK-NEXT: ; in Loop: Header=BB0_38 Depth=1 @@ -239,7 +239,7 @@ define fastcc i32 @prune_match(ptr nocapture readonly %a, ptr nocapture readonly ; CHECK-NEXT: b.eq LBB0_37 ; CHECK-NEXT: ; %bb.41: ; %while.body139 ; CHECK-NEXT: ; in Loop: Header=BB0_38 Depth=1 -; CHECK-NEXT: cmp w8, #94 +; CHECK-NEXT: cmp w12, #94 ; CHECK-NEXT: b.eq LBB0_37 ; CHECK-NEXT: LBB0_42: ; CHECK-NEXT: mov w0, wzr diff --git a/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll b/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll index c91de8f3a0a47..8f1c504a7f684 100644 --- a/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll +++ b/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll @@ -31,14 +31,12 @@ define dso_local void @run_test() local_unnamed_addr uwtable { ; CHECK-NEXT: .cfi_offset b14, -104 ; CHECK-NEXT: .cfi_offset b15, -112 ; CHECK-NEXT: movi v2.2d, #0000000000000000 -; CHECK-NEXT: // 
implicit-def: $q1 ; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: adrp x9, B+48 ; CHECK-NEXT: add x9, x9, :lo12:B+48 ; CHECK-NEXT: adrp x10, A ; CHECK-NEXT: add x10, x10, :lo12:A ; CHECK-NEXT: mov x11, xzr -; CHECK-NEXT: // kill: killed $q1 ; CHECK-NEXT: // implicit-def: $q1 ; CHECK-NEXT: mov x12, xzr ; CHECK-NEXT: // implicit-def: $q0 @@ -56,7 +54,7 @@ define dso_local void @run_test() local_unnamed_addr uwtable { ; CHECK-NEXT: // implicit-def: $q22 ; CHECK-NEXT: // implicit-def: $q23 ; CHECK-NEXT: // implicit-def: $q24 -; CHECK-NEXT: // implicit-def: $q9 +; CHECK-NEXT: // implicit-def: $q25 ; CHECK-NEXT: // implicit-def: $q27 ; CHECK-NEXT: // implicit-def: $q12 ; CHECK-NEXT: // implicit-def: $q28 @@ -66,95 +64,97 @@ define dso_local void @run_test() local_unnamed_addr uwtable { ; CHECK-NEXT: // implicit-def: $q30 ; CHECK-NEXT: // implicit-def: $q11 ; CHECK-NEXT: // implicit-def: $q31 -; CHECK-NEXT: // implicit-def: $q13 ; CHECK-NEXT: // kill: killed $q1 +; CHECK-NEXT: // implicit-def: $q9 +; CHECK-NEXT: // implicit-def: $q13 ; CHECK-NEXT: // implicit-def: $q1 ; CHECK-NEXT: // kill: killed $q1 ; CHECK-NEXT: .LBB0_1: // %for.cond1.preheader ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: stp q29, q15, [sp] // 32-byte Folded Spill -; CHECK-NEXT: ldr q15, [x8] ; CHECK-NEXT: ldr x15, [x8] -; CHECK-NEXT: str q14, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: add x20, x10, x11 -; CHECK-NEXT: mov v8.16b, v28.16b -; CHECK-NEXT: fmov x2, d15 -; CHECK-NEXT: mov x17, v15.d[1] +; CHECK-NEXT: ldr x6, [x8] +; CHECK-NEXT: ldr x20, [x20, #128] +; CHECK-NEXT: stp q25, q29, [sp] // 32-byte Folded Spill +; CHECK-NEXT: mov v29.16b, v21.16b +; CHECK-NEXT: stp q15, q14, [sp, #32] // 32-byte Folded Spill ; CHECK-NEXT: ldr q14, [x8] +; CHECK-NEXT: mov v21.16b, v0.16b +; CHECK-NEXT: ldr q15, [x8] +; CHECK-NEXT: mov v8.16b, v28.16b ; CHECK-NEXT: mov v28.16b, v24.16b -; CHECK-NEXT: mov v24.16b, v20.16b -; CHECK-NEXT: mov v20.16b, v17.16b ; CHECK-NEXT: fmov x13, d14 ; CHECK-NEXT: 
mov x16, v14.d[1] -; CHECK-NEXT: mov v17.16b, v5.16b -; CHECK-NEXT: mul x3, x2, x15 +; CHECK-NEXT: mov v24.16b, v20.16b +; CHECK-NEXT: fmov x2, d15 +; CHECK-NEXT: mov x17, v15.d[1] +; CHECK-NEXT: mov v20.16b, v17.16b ; CHECK-NEXT: ldr q14, [x9], #64 +; CHECK-NEXT: mov v17.16b, v5.16b +; CHECK-NEXT: mul x18, x13, x15 ; CHECK-NEXT: ldr q5, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldr x6, [x8] -; CHECK-NEXT: ldr x20, [x20, #128] -; CHECK-NEXT: mul x1, x17, x15 +; CHECK-NEXT: mov v25.16b, v6.16b ; CHECK-NEXT: mov x14, v14.d[1] ; CHECK-NEXT: fmov x5, d14 -; CHECK-NEXT: mov v29.16b, v21.16b -; CHECK-NEXT: mov v21.16b, v0.16b -; CHECK-NEXT: mov v25.16b, v6.16b -; CHECK-NEXT: mul x18, x13, x15 ; CHECK-NEXT: mov v6.16b, v2.16b +; CHECK-NEXT: mul x3, x2, x15 ; CHECK-NEXT: mov v26.16b, v22.16b -; CHECK-NEXT: fmov d15, x3 ; CHECK-NEXT: mov v22.16b, v18.16b ; CHECK-NEXT: mov v18.16b, v7.16b -; CHECK-NEXT: mul x0, x16, x15 ; CHECK-NEXT: mov v7.16b, v3.16b ; CHECK-NEXT: mov v16.16b, v4.16b +; CHECK-NEXT: mul x0, x16, x15 ; CHECK-NEXT: add x11, x11, #8 ; CHECK-NEXT: add x12, x12, #1 -; CHECK-NEXT: mov v15.d[1], x1 -; CHECK-NEXT: mul x4, x14, x15 -; CHECK-NEXT: cmp x11, #64 ; CHECK-NEXT: fmov d14, x18 +; CHECK-NEXT: cmp x11, #64 +; CHECK-NEXT: mul x1, x17, x15 +; CHECK-NEXT: fmov d15, x3 +; CHECK-NEXT: mul x4, x14, x15 +; CHECK-NEXT: mov v14.d[1], x0 ; CHECK-NEXT: mul x15, x5, x15 -; CHECK-NEXT: add v5.2d, v5.2d, v15.2d +; CHECK-NEXT: mov v15.d[1], x1 ; CHECK-NEXT: mul x21, x2, x6 -; CHECK-NEXT: mov v14.d[1], x0 +; CHECK-NEXT: add v5.2d, v5.2d, v14.2d +; CHECK-NEXT: add v9.2d, v9.2d, v14.2d ; CHECK-NEXT: mul x2, x2, x20 ; CHECK-NEXT: fmov d0, x15 -; CHECK-NEXT: str q5, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: ldr q5, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: add v13.2d, v13.2d, v15.2d ; CHECK-NEXT: mul x22, x13, x20 -; CHECK-NEXT: add v5.2d, v5.2d, v14.2d +; CHECK-NEXT: str q5, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: fmov d3, x21 +; CHECK-NEXT: ldp 
q15, q14, [sp, #32] // 32-byte Folded Reload ; CHECK-NEXT: mul x19, x17, x6 ; CHECK-NEXT: mov v0.d[1], x4 +; CHECK-NEXT: mov v5.16b, v13.16b ; CHECK-NEXT: fmov d1, x2 +; CHECK-NEXT: mov v13.16b, v9.16b +; CHECK-NEXT: ldr q9, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: mul x17, x17, x20 -; CHECK-NEXT: str q5, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: add v5.2d, v13.2d, v14.2d ; CHECK-NEXT: fmov d2, x22 -; CHECK-NEXT: ldr q13, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: mul x7, x16, x6 -; CHECK-NEXT: ldp q15, q14, [sp, #16] // 32-byte Folded Reload +; CHECK-NEXT: add v9.2d, v9.2d, v0.2d ; CHECK-NEXT: mov v3.d[1], x19 -; CHECK-NEXT: add v13.2d, v13.2d, v0.2d ; CHECK-NEXT: mul x16, x16, x20 ; CHECK-NEXT: mov v1.d[1], x17 +; CHECK-NEXT: str q9, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov v9.16b, v13.16b ; CHECK-NEXT: mul x23, x5, x20 -; CHECK-NEXT: str q13, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: mov v13.16b, v5.16b ; CHECK-NEXT: mov v5.16b, v17.16b ; CHECK-NEXT: mov v17.16b, v20.16b ; CHECK-NEXT: mov v20.16b, v24.16b -; CHECK-NEXT: mul x13, x13, x6 ; CHECK-NEXT: mov v24.16b, v28.16b +; CHECK-NEXT: mul x13, x13, x6 ; CHECK-NEXT: add v11.2d, v11.2d, v3.2d +; CHECK-NEXT: add v27.2d, v27.2d, v3.2d ; CHECK-NEXT: mov v2.d[1], x16 ; CHECK-NEXT: add v15.2d, v15.2d, v1.2d -; CHECK-NEXT: add v27.2d, v27.2d, v3.2d -; CHECK-NEXT: mul x18, x14, x20 ; CHECK-NEXT: add v23.2d, v23.2d, v3.2d +; CHECK-NEXT: mul x18, x14, x20 ; CHECK-NEXT: add v19.2d, v19.2d, v3.2d -; CHECK-NEXT: fmov d4, x23 ; CHECK-NEXT: add v10.2d, v10.2d, v3.2d +; CHECK-NEXT: fmov d4, x23 ; CHECK-NEXT: mul x15, x5, x6 ; CHECK-NEXT: fmov d0, x13 ; CHECK-NEXT: add v14.2d, v14.2d, v2.2d @@ -164,6 +164,7 @@ define dso_local void @run_test() local_unnamed_addr uwtable { ; CHECK-NEXT: mov v7.16b, v18.16b ; CHECK-NEXT: mov v4.d[1], x18 ; CHECK-NEXT: mov v18.16b, v22.16b +; CHECK-NEXT: mov v6.16b, v25.16b ; CHECK-NEXT: mov v0.d[1], x7 ; CHECK-NEXT: fmov d1, x15 ; CHECK-NEXT: add v28.2d, 
v8.2d, v4.2d @@ -181,38 +182,36 @@ define dso_local void @run_test() local_unnamed_addr uwtable { ; CHECK-NEXT: add v3.2d, v3.2d, v0.2d ; CHECK-NEXT: mov v0.16b, v21.16b ; CHECK-NEXT: mov v21.16b, v29.16b -; CHECK-NEXT: ldr q29, [sp] // 16-byte Folded Reload -; CHECK-NEXT: add v9.2d, v9.2d, v1.2d -; CHECK-NEXT: add v6.2d, v25.2d, v1.2d +; CHECK-NEXT: add v6.2d, v6.2d, v1.2d +; CHECK-NEXT: ldp q25, q29, [sp] // 32-byte Folded Reload ; CHECK-NEXT: add v5.2d, v5.2d, v1.2d -; CHECK-NEXT: add v29.2d, v29.2d, v1.2d ; CHECK-NEXT: add v21.2d, v21.2d, v1.2d ; CHECK-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-NEXT: add v29.2d, v29.2d, v1.2d +; CHECK-NEXT: add v25.2d, v25.2d, v1.2d ; CHECK-NEXT: b.ne .LBB0_1 ; CHECK-NEXT: // %bb.2: // %for.cond.cleanup -; CHECK-NEXT: ldr q1, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: adrp x8, C ; CHECK-NEXT: add x8, x8, :lo12:C -; CHECK-NEXT: stp q11, q30, [x8, #80] +; CHECK-NEXT: stp q31, q11, [x8, #64] ; CHECK-NEXT: ldp x20, x19, [sp, #192] // 16-byte Folded Reload -; CHECK-NEXT: str q1, [x8] -; CHECK-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: stp q1, q13, [x8] +; CHECK-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldr x23, [sp, #160] // 8-byte Folded Reload -; CHECK-NEXT: stp q15, q14, [x8, #144] +; CHECK-NEXT: stp q30, q29, [x8, #96] ; CHECK-NEXT: ldp x22, x21, [sp, #176] // 16-byte Folded Reload -; CHECK-NEXT: stp q1, q13, [x8, #16] -; CHECK-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: stp q9, q1, [x8, #32] +; CHECK-NEXT: ldp d9, d8, [sp, #144] // 16-byte Folded Reload +; CHECK-NEXT: stp q15, q14, [x8, #144] +; CHECK-NEXT: ldp d15, d14, [sp, #96] // 16-byte Folded Reload ; CHECK-NEXT: stp q28, q12, [x8, #176] ; CHECK-NEXT: ldp d13, d12, [sp, #112] // 16-byte Folded Reload -; CHECK-NEXT: stp q1, q31, [x8, #48] -; CHECK-NEXT: ldp d15, d14, [sp, #96] // 16-byte Folded Reload -; CHECK-NEXT: stp q9, q24, [x8, #240] -; 
CHECK-NEXT: ldp d9, d8, [sp, #144] // 16-byte Folded Reload ; CHECK-NEXT: stp q19, q18, [x8, #336] ; CHECK-NEXT: stp q10, q7, [x8, #400] ; CHECK-NEXT: ldp d11, d10, [sp, #128] // 16-byte Folded Reload -; CHECK-NEXT: str q29, [x8, #112] ; CHECK-NEXT: str q27, [x8, #208] +; CHECK-NEXT: stp q25, q24, [x8, #240] ; CHECK-NEXT: stp q23, q22, [x8, #272] ; CHECK-NEXT: stp q21, q20, [x8, #304] ; CHECK-NEXT: stp q6, q17, [x8, #368] diff --git a/llvm/test/CodeGen/AArch64/reduce-or-opt.ll b/llvm/test/CodeGen/AArch64/reduce-or-opt.ll index f5df5ea53c990..b3dfab8f69b59 100644 --- a/llvm/test/CodeGen/AArch64/reduce-or-opt.ll +++ b/llvm/test/CodeGen/AArch64/reduce-or-opt.ll @@ -95,18 +95,18 @@ define i64 @select_or_reduce_nxv2i1(ptr nocapture noundef readonly %src) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: cntd x8 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov x9, xzr -; CHECK-NEXT: neg x10, x8 -; CHECK-NEXT: add x10, x10, #4 +; CHECK-NEXT: mov x10, xzr +; CHECK-NEXT: neg x9, x8 +; CHECK-NEXT: add x9, x9, #4 ; CHECK-NEXT: .LBB2_1: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x10, lsl #3] ; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0 ; CHECK-NEXT: b.ne .LBB2_3 ; CHECK-NEXT: // %bb.2: // %vector.body ; CHECK-NEXT: // in Loop: Header=BB2_1 Depth=1 -; CHECK-NEXT: cmp x10, x9 -; CHECK-NEXT: add x9, x9, x8 +; CHECK-NEXT: cmp x9, x10 +; CHECK-NEXT: add x10, x10, x8 ; CHECK-NEXT: b.ne .LBB2_1 ; CHECK-NEXT: .LBB2_3: // %middle.split ; CHECK-NEXT: ptest p0, p1.b @@ -138,18 +138,18 @@ define i64 @br_or_reduce_nxv2i1(ptr nocapture noundef readonly %src, ptr noundef ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: cntd x8 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov x9, xzr -; CHECK-NEXT: neg x10, x8 -; CHECK-NEXT: add x10, x10, #4 +; CHECK-NEXT: mov x10, xzr +; CHECK-NEXT: neg x9, x8 +; CHECK-NEXT: add x9, x9, #4 ; CHECK-NEXT: .LBB3_1: // %vector.body ; CHECK-NEXT: // =>This Inner Loop 
Header: Depth=1 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x10, lsl #3] ; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0 ; CHECK-NEXT: b.ne .LBB3_3 ; CHECK-NEXT: // %bb.2: // %vector.body ; CHECK-NEXT: // in Loop: Header=BB3_1 Depth=1 -; CHECK-NEXT: cmp x10, x9 -; CHECK-NEXT: add x9, x9, x8 +; CHECK-NEXT: cmp x9, x10 +; CHECK-NEXT: add x10, x10, x8 ; CHECK-NEXT: b.ne .LBB3_1 ; CHECK-NEXT: .LBB3_3: // %middle.split ; CHECK-NEXT: ptest p0, p1.b diff --git a/llvm/test/CodeGen/AArch64/sink-and-fold.ll b/llvm/test/CodeGen/AArch64/sink-and-fold.ll index 4d383fefc43c7..163124c0d2757 100644 --- a/llvm/test/CodeGen/AArch64/sink-and-fold.ll +++ b/llvm/test/CodeGen/AArch64/sink-and-fold.ll @@ -151,7 +151,7 @@ define void @f4(ptr %a, i64 %n) nounwind "target-features"="+alu-lsl-fast" { ; CHECK-NEXT: .LBB4_3: // %LI ; CHECK-NEXT: // =>This Loop Header: Depth=1 ; CHECK-NEXT: // Child Loop BB4_6 Depth 2 -; CHECK-NEXT: mov x21, xzr +; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: add x23, x22, #1 ; CHECK-NEXT: b .LBB4_6 ; CHECK-NEXT: .LBB4_4: // %if.else @@ -162,13 +162,13 @@ define void @f4(ptr %a, i64 %n) nounwind "target-features"="+alu-lsl-fast" { ; CHECK-NEXT: add x8, x21, #1 ; CHECK-NEXT: str w0, [x20, x21, lsl #2] ; CHECK-NEXT: sub x9, x8, #1 -; CHECK-NEXT: mov x21, x8 ; CHECK-NEXT: cmp x9, x19 ; CHECK-NEXT: b.ge .LBB4_2 ; CHECK-NEXT: .LBB4_6: // %LJ ; CHECK-NEXT: // Parent Loop BB4_3 Depth=1 ; CHECK-NEXT: // => This Inner Loop Header: Depth=2 -; CHECK-NEXT: ldr w8, [x20, x21, lsl #2] +; CHECK-NEXT: mov x21, x8 +; CHECK-NEXT: ldr w8, [x20, x8, lsl #2] ; CHECK-NEXT: tbz w8, #31, .LBB4_4 ; CHECK-NEXT: // %bb.7: // %if.then ; CHECK-NEXT: // in Loop: Header=BB4_6 Depth=2 diff --git a/llvm/test/CodeGen/AArch64/sve-lsrchain.ll b/llvm/test/CodeGen/AArch64/sve-lsrchain.ll index d94fa6433bb7f..2fe3001ec0f44 100644 --- a/llvm/test/CodeGen/AArch64/sve-lsrchain.ll +++ b/llvm/test/CodeGen/AArch64/sve-lsrchain.ll @@ -14,22 +14,22 @@ define void 
@test(ptr nocapture noundef readonly %kernel, i32 noundef %kw, float ; CHECK-NEXT: // %bb.2: // %for.body.us.preheader ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: add x11, x2, x11, lsl #1 -; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: ptrue p1.b -; CHECK-NEXT: mov x9, xzr +; CHECK-NEXT: mov w9, wzr ; CHECK-NEXT: mov w10, wzr ; CHECK-NEXT: mov x12, #4 // =0x4 ; CHECK-NEXT: mov x13, #8 // =0x8 ; CHECK-NEXT: .LBB0_3: // %for.body.us ; CHECK-NEXT: // =>This Loop Header: Depth=1 ; CHECK-NEXT: // Child Loop BB0_4 Depth 2 -; CHECK-NEXT: add x14, x0, x9, lsl #2 -; CHECK-NEXT: sbfiz x15, x8, #1, #32 +; CHECK-NEXT: add x14, x0, x8, lsl #2 +; CHECK-NEXT: sbfiz x15, x9, #1, #32 ; CHECK-NEXT: mov x16, x2 ; CHECK-NEXT: ldp s0, s1, [x14] ; CHECK-NEXT: add x15, x15, #8 ; CHECK-NEXT: ldp s2, s3, [x14, #8] -; CHECK-NEXT: ubfiz x14, x8, #1, #32 +; CHECK-NEXT: ubfiz x14, x9, #1, #32 ; CHECK-NEXT: fcvt h0, s0 ; CHECK-NEXT: fcvt h1, s1 ; CHECK-NEXT: fcvt h2, s2 @@ -91,8 +91,8 @@ define void @test(ptr nocapture noundef readonly %kernel, i32 noundef %kw, float ; CHECK-NEXT: // %bb.5: // %while.cond.i..exit_crit_edge.us ; CHECK-NEXT: // in Loop: Header=BB0_3 Depth=1 ; CHECK-NEXT: add w10, w10, #1 -; CHECK-NEXT: add x9, x9, #4 -; CHECK-NEXT: add w8, w8, #16 +; CHECK-NEXT: add x8, x8, #4 +; CHECK-NEXT: add w9, w9, #16 ; CHECK-NEXT: cmp w10, w1 ; CHECK-NEXT: b.ne .LBB0_3 ; CHECK-NEXT: .LBB0_6: // %exit78 diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll b/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll index 124f81e7864d1..39fe92aae0619 100644 --- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll +++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll @@ -11,12 +11,12 @@ define void @test_sink_ptrue_into_ptest(i32 %n) { ; CHECK-NEXT: whilelt p0.s, wzr, w0 ; CHECK-NEXT: b.pl .LBB0_3 ; CHECK-NEXT: // %bb.1: // %for.body.preheader -; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: cntw x9 +; CHECK-NEXT: mov w9, wzr +; CHECK-NEXT: cntw x8 ; CHECK-NEXT: .LBB0_2: // 
%for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: whilelt p0.s, w8, w0 -; CHECK-NEXT: add w8, w8, w9 +; CHECK-NEXT: whilelt p0.s, w9, w0 +; CHECK-NEXT: add w9, w9, w8 ; CHECK-NEXT: b.mi .LBB0_2 ; CHECK-NEXT: .LBB0_3: // %exit ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/swifterror.ll b/llvm/test/CodeGen/AArch64/swifterror.ll index 07ee87e880aff..9949a48034815 100644 --- a/llvm/test/CodeGen/AArch64/swifterror.ll +++ b/llvm/test/CodeGen/AArch64/swifterror.ll @@ -564,10 +564,11 @@ define float @foo_loop(ptr swifterror %error_ptr_ref, i32 %cc, float %cc2) { ; CHECK-O0-AARCH64-NEXT: mov w8, #16 ; =0x10 ; CHECK-O0-AARCH64-NEXT: mov w0, w8 ; CHECK-O0-AARCH64-NEXT: bl _malloc -; CHECK-O0-AARCH64-NEXT: mov x9, x0 +; CHECK-O0-AARCH64-NEXT: mov x1, x0 +; CHECK-O0-AARCH64-NEXT: mov x0, x1 +; CHECK-O0-AARCH64-NEXT: str x1, [sp, #8] ; 8-byte Folded Spill ; CHECK-O0-AARCH64-NEXT: mov w8, #1 ; =0x1 -; CHECK-O0-AARCH64-NEXT: strb w8, [x9, #8] -; CHECK-O0-AARCH64-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-O0-AARCH64-NEXT: strb w8, [x0, #8] ; CHECK-O0-AARCH64-NEXT: LBB4_3: ; %bb_cont ; CHECK-O0-AARCH64-NEXT: ; in Loop: Header=BB4_1 Depth=1 ; CHECK-O0-AARCH64-NEXT: ldr s0, [sp, #16] ; 4-byte Folded Reload @@ -605,12 +606,11 @@ define float @foo_loop(ptr swifterror %error_ptr_ref, i32 %cc, float %cc2) { ; CHECK-O0-ARM64_32-NEXT: mov w8, #16 ; =0x10 ; CHECK-O0-ARM64_32-NEXT: mov w0, w8 ; CHECK-O0-ARM64_32-NEXT: bl _malloc -; CHECK-O0-ARM64_32-NEXT: mov x9, x0 -; CHECK-O0-ARM64_32-NEXT: ; kill: def $x0 killed $x9 -; CHECK-O0-ARM64_32-NEXT: mov x0, x9 +; CHECK-O0-ARM64_32-NEXT: ; kill: def $x1 killed $x0 +; CHECK-O0-ARM64_32-NEXT: mov x1, x0 +; CHECK-O0-ARM64_32-NEXT: str x1, [sp, #8] ; 8-byte Folded Spill ; CHECK-O0-ARM64_32-NEXT: mov w8, #1 ; =0x1 -; CHECK-O0-ARM64_32-NEXT: strb w8, [x9, #8] -; CHECK-O0-ARM64_32-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill +; CHECK-O0-ARM64_32-NEXT: strb w8, [x0, #8] ; CHECK-O0-ARM64_32-NEXT: LBB4_3: ; 
%bb_cont ; CHECK-O0-ARM64_32-NEXT: ; in Loop: Header=BB4_1 Depth=1 ; CHECK-O0-ARM64_32-NEXT: ldr s0, [sp, #16] ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll index 666523c88860c..11afcebfada2e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll @@ -1670,23 +1670,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, s16 -; GFX942-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen +; GFX942-NEXT: v_mov_b32_e32 v1, s16 +; GFX942-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_max_f32_e32 v3, v0, v0 +; GFX942-NEXT: v_max_f32_e32 v0, v0, v0 ; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX942-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[4:5] ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 ; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX942-NEXT: s_cbranch_execnz .LBB13_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1718,22 +1718,22 @@ define void 
@buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s20 -; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX90A-NEXT: v_mov_b32_e32 v1, s20 +; GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v3, v0, v0 +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v0 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX90A-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1743,23 +1743,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, s20 -; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX908-NEXT: v_mov_b32_e32 v3, s20 +; GFX908-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX908-NEXT: 
s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v3, v0, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v0, v0 ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX908-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX908-NEXT: v_max_f32_e32 v1, v5, v5 +; GFX908-NEXT: v_max_f32_e32 v4, v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1769,23 +1769,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s20 -; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX8-NEXT: v_mov_b32_e32 v3, s20 +; GFX8-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX8-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; 
GFX8-NEXT: v_max_f32_e32 v4, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1986,23 +1986,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, s16 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], null offen ; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 -; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[9:10], v[9:10] +; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[2:3], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX12-NEXT: v_dual_mov_b32 v4, v9 :: v_dual_mov_b32 v5, v10 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[2:5], v6, s[0:3], null offen 
th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] -; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[9:10] ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2026,24 +2027,25 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, s16 -; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], 0 offen ; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 -; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[2:3], v[9:10], v[9:10] +; GFX11-NEXT: v_max_f64 v[7:8], v[2:3], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX11-NEXT: v_dual_mov_b32 v4, v9 :: v_dual_mov_b32 v5, v10 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[2:5], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] -; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: 
v_dual_mov_b32 v3, v8 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[9:10] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2077,24 +2079,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, s20 ; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v6, s[16:19], 0 offen -; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX908-NEXT: v_mov_b32_e32 v10, v3 ; GFX908-NEXT: v_mov_b32_e32 v9, v2 -; GFX908-NEXT: v_mov_b32_e32 v8, v1 -; GFX908-NEXT: v_mov_b32_e32 v7, v0 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc +; GFX908-NEXT: v_max_f64 v[2:3], v[9:10], v[9:10] +; GFX908-NEXT: v_max_f64 v[7:8], v[2:3], v[0:1] +; GFX908-NEXT: v_mov_b32_e32 v2, v7 +; GFX908-NEXT: v_mov_b32_e32 v3, v8 +; GFX908-NEXT: v_mov_b32_e32 v4, v9 +; GFX908-NEXT: v_mov_b32_e32 v5, v10 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[2:5], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] -; GFX908-NEXT: v_mov_b32_e32 v2, v7 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[9:10] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v8 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB15_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2106,24 +2108,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, s20 ; 
GFX8-NEXT: buffer_load_dwordx2 v[2:3], v6, s[16:19], 0 offen -; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v10, v3 ; GFX8-NEXT: v_mov_b32_e32 v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v8, v1 -; GFX8-NEXT: v_mov_b32_e32 v7, v0 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_max_f64 v[2:3], v[9:10], v[9:10] +; GFX8-NEXT: v_max_f64 v[7:8], v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, v7 +; GFX8-NEXT: v_mov_b32_e32 v3, v8 +; GFX8-NEXT: v_mov_b32_e32 v4, v9 +; GFX8-NEXT: v_mov_b32_e32 v5, v10 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[2:5], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, v7 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[9:10] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v8 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB15_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll index 351502816ae6e..404da6a8a1ef7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll @@ -1670,23 +1670,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, s16 -; GFX942-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen +; GFX942-NEXT: 
v_mov_b32_e32 v1, s16 +; GFX942-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_max_f32_e32 v3, v0, v0 +; GFX942-NEXT: v_max_f32_e32 v0, v0, v0 ; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX942-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v5, v5 +; GFX942-NEXT: v_min_f32_e32 v4, v2, v0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[4:5] ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 ; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX942-NEXT: s_cbranch_execnz .LBB13_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1718,22 +1718,22 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s20 -; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX90A-NEXT: v_mov_b32_e32 v1, s20 +; GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v3, v0, v0 +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v0 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX90A-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] 
op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v2, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1743,23 +1743,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, s20 -; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX908-NEXT: v_mov_b32_e32 v3, s20 +; GFX908-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v3, v0, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v0, v0 ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX908-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX908-NEXT: v_max_f32_e32 v1, v5, v5 +; GFX908-NEXT: v_min_f32_e32 v4, v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX908-NEXT: 
v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1769,23 +1769,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s20 -; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX8-NEXT: v_mov_b32_e32 v3, s20 +; GFX8-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX8-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; GFX8-NEXT: v_min_f32_e32 v4, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1986,23 +1986,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, s16 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] +; GFX12-NEXT: 
v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], null offen ; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 -; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[9:10], v[9:10] +; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[2:3], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX12-NEXT: v_dual_mov_b32 v4, v9 :: v_dual_mov_b32 v5, v10 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[2:5], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] -; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[9:10] ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2026,24 +2027,25 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, s16 -; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], 0 offen ; GFX11-NEXT: .LBB15_1: ; 
%atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 -; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[2:3], v[9:10], v[9:10] +; GFX11-NEXT: v_min_f64 v[7:8], v[2:3], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX11-NEXT: v_dual_mov_b32 v4, v9 :: v_dual_mov_b32 v5, v10 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[2:5], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] -; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[9:10] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2077,24 +2079,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, s20 ; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v6, s[16:19], 0 offen -; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_min_f64 v[0:1], 
v[0:1], v[4:5] ; GFX908-NEXT: v_mov_b32_e32 v10, v3 ; GFX908-NEXT: v_mov_b32_e32 v9, v2 -; GFX908-NEXT: v_mov_b32_e32 v8, v1 -; GFX908-NEXT: v_mov_b32_e32 v7, v0 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc +; GFX908-NEXT: v_max_f64 v[2:3], v[9:10], v[9:10] +; GFX908-NEXT: v_min_f64 v[7:8], v[2:3], v[0:1] +; GFX908-NEXT: v_mov_b32_e32 v2, v7 +; GFX908-NEXT: v_mov_b32_e32 v3, v8 +; GFX908-NEXT: v_mov_b32_e32 v4, v9 +; GFX908-NEXT: v_mov_b32_e32 v5, v10 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[2:5], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] -; GFX908-NEXT: v_mov_b32_e32 v2, v7 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[9:10] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v8 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB15_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2106,24 +2108,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, s20 ; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v6, s[16:19], 0 offen -; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v10, v3 ; GFX8-NEXT: v_mov_b32_e32 v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v8, v1 -; GFX8-NEXT: v_mov_b32_e32 v7, v0 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_max_f64 v[2:3], v[9:10], v[9:10] +; GFX8-NEXT: v_min_f64 v[7:8], v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, v7 +; GFX8-NEXT: v_mov_b32_e32 v3, v8 +; GFX8-NEXT: v_mov_b32_e32 v4, v9 +; 
GFX8-NEXT: v_mov_b32_e32 v5, v10 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[2:5], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, v7 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[9:10] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v8 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB15_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll index d13d6a19d332a..14e000387f7eb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll @@ -387,11 +387,11 @@ define void @nested_loops_temporal_divergence_both(float %pre.cond.val, i32 %n.i ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB5_2 Depth 2 ; GFX10-NEXT: s_ashr_i32 s7, s6, 31 -; GFX10-NEXT: s_mov_b32 s4, s8 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_lshl_b64 s[10:11], s[6:7], 2 ; GFX10-NEXT: v_mov_b32_e32 v8, s10 ; GFX10-NEXT: v_mov_b32_e32 v9, s11 -; GFX10-NEXT: s_mov_b32 s10, 0 +; GFX10-NEXT: s_mov_b32 s10, s8 ; GFX10-NEXT: s_mov_b32 s11, 0 ; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v2, v8 ; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v3, v9, vcc_lo @@ -402,18 +402,18 @@ define void @nested_loops_temporal_divergence_both(float %pre.cond.val, i32 %n.i ; GFX10-NEXT: v_cvt_f32_u32_e32 v8, s11 ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_add_i32 s11, s11, 1 -; GFX10-NEXT: s_xor_b32 s4, s4, s12 +; GFX10-NEXT: s_xor_b32 s10, s10, s12 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v8, v0 -; GFX10-NEXT: s_or_b32 s10, vcc_lo, s10 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 s9, s9, exec_lo -; GFX10-NEXT: s_and_b32 
s12, exec_lo, s4 +; GFX10-NEXT: s_and_b32 s12, exec_lo, s10 ; GFX10-NEXT: s_or_b32 s9, s9, s12 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s10 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB5_2 ; GFX10-NEXT: ; %bb.3: ; %UseInst ; GFX10-NEXT: ; in Loop: Header=BB5_1 Depth=1 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s10 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v9, s7 ; GFX10-NEXT: v_mov_b32_e32 v8, s6 ; GFX10-NEXT: v_cmp_lt_u32_e32 vcc_lo, s6, v1 diff --git a/llvm/test/CodeGen/AMDGPU/add_i1.ll b/llvm/test/CodeGen/AMDGPU/add_i1.ll index ca605986da941..2c584e5aa5437 100644 --- a/llvm/test/CodeGen/AMDGPU/add_i1.ll +++ b/llvm/test/CodeGen/AMDGPU/add_i1.ll @@ -111,25 +111,25 @@ define amdgpu_kernel void @add_i1_cf(ptr addrspace(1) %out, ptr addrspace(1) %a, ; GFX9-LABEL: add_i1_cf: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 ; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX9-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GFX9-NEXT: s_xor_b64 s[8:9], exec, s[8:9] ; GFX9-NEXT: s_cbranch_execz .LBB2_2 ; GFX9-NEXT: ; %bb.1: ; %else ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v0, v0, s[8:9] glc +; GFX9-NEXT: global_load_ubyte v0, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 ; GFX9-NEXT: .LBB2_2: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_andn2_saveexec_b64 s[6:7], s[8:9] ; GFX9-NEXT: s_cbranch_execz .LBB2_4 ; GFX9-NEXT: ; %bb.3: ; %if ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] glc ; 
GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_andn2_b64 s[2:3], s[4:5], exec @@ -139,7 +139,6 @@ define amdgpu_kernel void @add_i1_cf(ptr addrspace(1) %out, ptr addrspace(1) %a, ; GFX9-NEXT: s_or_b64 s[4:5], s[2:3], s[4:5] ; GFX9-NEXT: .LBB2_4: ; %endif ; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_xor_b64 s[2:3], s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll index 0d5f538215f18..5a6181c3d32a6 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll @@ -10470,7 +10470,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: v_readfirstlane_b32 s5, v14 ; GFX11-NEXT: v_writelane_b32 v16, s37, 5 ; GFX11-NEXT: v_writelane_b32 v17, s101, 5 -; GFX11-NEXT: s_mov_b32 s101, 0 +; GFX11-NEXT: s_mov_b32 s44, 0 ; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo ; GFX11-NEXT: ; implicit-def: $vgpr19 : SGPR spill to VGPR lane ; GFX11-NEXT: ; implicit-def: $vgpr18 : SGPR spill to VGPR lane @@ -10503,676 +10503,497 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: v_writelane_b32 v16, s85, 29 ; GFX11-NEXT: v_writelane_b32 v16, s86, 30 ; GFX11-NEXT: v_writelane_b32 v16, s87, 31 -; GFX11-NEXT: s_cbranch_scc0 .LBB13_2 +; GFX11-NEXT: s_cbranch_scc0 .LBB13_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: s_lshr_b32 s43, s25, 8 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[4:5], 24 -; GFX11-NEXT: v_writelane_b32 v19, s43, 16 -; GFX11-NEXT: s_lshr_b32 s43, s24, 16 -; GFX11-NEXT: s_lshr_b32 s104, s5, 24 -; GFX11-NEXT: s_lshr_b32 s102, s5, 16 -; GFX11-NEXT: s_lshr_b32 s103, s5, 8 -; GFX11-NEXT: v_writelane_b32 v19, s43, 17 -; GFX11-NEXT: s_lshr_b32 s43, s24, 8 -; GFX11-NEXT: s_lshr_b32 s57, s4, 16 -; GFX11-NEXT: s_lshr_b32 s47, s4, 8 -; 
GFX11-NEXT: s_lshr_b32 s46, s7, 24 -; GFX11-NEXT: v_writelane_b32 v19, s43, 18 -; GFX11-NEXT: s_lshr_b32 s43, s23, 24 -; GFX11-NEXT: s_lshr_b32 vcc_hi, s7, 16 -; GFX11-NEXT: s_lshr_b32 s34, s7, 8 -; GFX11-NEXT: s_lshr_b32 s69, s6, 16 -; GFX11-NEXT: v_writelane_b32 v19, s43, 19 -; GFX11-NEXT: s_lshr_b32 s43, s23, 16 -; GFX11-NEXT: s_lshr_b32 s56, s6, 8 -; GFX11-NEXT: s_lshr_b32 s35, s9, 24 -; GFX11-NEXT: s_lshr_b32 s36, s9, 16 -; GFX11-NEXT: v_writelane_b32 v19, s43, 20 -; GFX11-NEXT: s_lshr_b32 s43, s23, 8 -; GFX11-NEXT: s_lshr_b32 s37, s9, 8 -; GFX11-NEXT: s_lshr_b32 s38, s8, 16 -; GFX11-NEXT: s_lshr_b32 s39, s8, 8 -; GFX11-NEXT: v_writelane_b32 v19, s43, 21 -; GFX11-NEXT: s_lshr_b32 s43, s22, 16 -; GFX11-NEXT: s_lshr_b32 s48, s11, 24 -; GFX11-NEXT: s_lshr_b32 s49, s11, 16 -; GFX11-NEXT: s_lshr_b32 s50, s11, 8 -; GFX11-NEXT: v_writelane_b32 v19, s43, 22 -; GFX11-NEXT: s_lshr_b32 s43, s22, 8 -; GFX11-NEXT: s_lshr_b32 s51, s10, 16 -; GFX11-NEXT: s_lshr_b32 s52, s10, 8 -; GFX11-NEXT: s_lshr_b32 s53, s13, 24 -; GFX11-NEXT: v_writelane_b32 v19, s43, 23 -; GFX11-NEXT: s_lshr_b32 s43, s21, 24 -; GFX11-NEXT: s_lshr_b32 s54, s13, 16 -; GFX11-NEXT: s_lshr_b32 s55, s13, 8 -; GFX11-NEXT: s_lshr_b32 s64, s12, 16 -; GFX11-NEXT: v_writelane_b32 v19, s43, 24 -; GFX11-NEXT: s_lshr_b32 s43, s21, 16 -; GFX11-NEXT: s_lshr_b32 s65, s12, 8 -; GFX11-NEXT: s_lshr_b32 s66, s15, 24 -; GFX11-NEXT: s_lshr_b32 s67, s15, 16 -; GFX11-NEXT: v_writelane_b32 v19, s43, 25 -; GFX11-NEXT: s_lshr_b32 s43, s21, 8 -; GFX11-NEXT: s_lshr_b32 s68, s15, 8 -; GFX11-NEXT: s_lshr_b32 s59, s14, 16 -; GFX11-NEXT: s_lshr_b32 s58, s14, 8 -; GFX11-NEXT: v_writelane_b32 v19, s43, 26 -; GFX11-NEXT: s_lshr_b32 s43, s20, 16 -; GFX11-NEXT: s_lshr_b32 s70, s41, 24 -; GFX11-NEXT: s_lshr_b32 s71, s41, 16 -; GFX11-NEXT: s_lshr_b32 s60, s41, 8 -; GFX11-NEXT: v_writelane_b32 v19, s43, 27 -; GFX11-NEXT: s_lshr_b32 s43, s20, 8 -; GFX11-NEXT: s_lshr_b32 s80, s40, 16 -; GFX11-NEXT: s_lshr_b32 s61, s40, 8 -; GFX11-NEXT: s_lshr_b32 
s81, s29, 24 -; GFX11-NEXT: v_writelane_b32 v19, s43, 28 -; GFX11-NEXT: s_lshr_b32 s43, s19, 24 -; GFX11-NEXT: s_lshr_b32 s82, s29, 16 -; GFX11-NEXT: s_lshr_b32 s83, s29, 8 -; GFX11-NEXT: s_lshr_b32 s84, s28, 16 -; GFX11-NEXT: v_writelane_b32 v19, s43, 29 -; GFX11-NEXT: s_lshr_b32 s43, s19, 16 -; GFX11-NEXT: s_lshr_b32 s85, s28, 8 -; GFX11-NEXT: s_lshr_b32 s86, s27, 24 -; GFX11-NEXT: s_lshr_b32 s72, s27, 16 -; GFX11-NEXT: v_writelane_b32 v19, s43, 30 -; GFX11-NEXT: s_lshr_b32 s43, s19, 8 -; GFX11-NEXT: s_lshr_b32 s87, s27, 8 -; GFX11-NEXT: s_lshr_b32 s73, s26, 16 -; GFX11-NEXT: s_lshr_b32 s96, s26, 8 -; GFX11-NEXT: v_writelane_b32 v19, s43, 31 -; GFX11-NEXT: s_lshr_b32 s43, s18, 16 -; GFX11-NEXT: s_lshr_b32 s97, s25, 24 -; GFX11-NEXT: v_writelane_b32 v18, s43, 0 -; GFX11-NEXT: s_lshr_b32 s43, s18, 8 -; GFX11-NEXT: v_writelane_b32 v19, s62, 14 -; GFX11-NEXT: s_lshr_b32 s42, s25, 16 -; GFX11-NEXT: s_lshr_b32 s74, s2, 16 -; GFX11-NEXT: v_writelane_b32 v18, s43, 1 -; GFX11-NEXT: s_lshr_b32 s43, s17, 24 -; GFX11-NEXT: v_writelane_b32 v19, s63, 15 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[6:7], 24 +; GFX11-NEXT: s_lshr_b32 s42, s5, 24 +; GFX11-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 +; GFX11-NEXT: v_writelane_b32 v19, s42, 4 +; GFX11-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-NEXT: s_lshr_b32 s102, s29, 8 +; GFX11-NEXT: s_lshr_b32 s103, s28, 16 +; GFX11-NEXT: s_lshr_b32 s104, s28, 8 +; GFX11-NEXT: v_writelane_b32 v19, s42, 5 +; GFX11-NEXT: s_lshr_b32 s42, s5, 8 +; GFX11-NEXT: s_lshr_b32 vcc_hi, s27, 24 +; GFX11-NEXT: s_lshr_b32 s34, s27, 16 +; GFX11-NEXT: s_lshr_b32 s35, s27, 8 +; GFX11-NEXT: v_writelane_b32 v19, s42, 6 +; GFX11-NEXT: s_lshr_b32 s42, s4, 16 +; GFX11-NEXT: s_lshr_b32 s36, s26, 16 +; GFX11-NEXT: s_lshr_b32 s37, s26, 8 +; GFX11-NEXT: s_lshr_b32 s38, s25, 24 +; GFX11-NEXT: v_writelane_b32 v19, s42, 7 +; GFX11-NEXT: s_lshr_b32 s42, s4, 8 +; GFX11-NEXT: s_lshr_b32 s39, s25, 16 +; GFX11-NEXT: s_lshr_b32 s48, s25, 8 +; GFX11-NEXT: s_lshr_b32 s49, s24, 16 +; GFX11-NEXT: 
v_writelane_b32 v19, s42, 8 +; GFX11-NEXT: s_lshr_b32 s42, s7, 24 +; GFX11-NEXT: s_lshr_b32 s50, s24, 8 +; GFX11-NEXT: s_lshr_b32 s51, s23, 24 +; GFX11-NEXT: s_lshr_b32 s52, s23, 16 +; GFX11-NEXT: v_writelane_b32 v19, s42, 9 +; GFX11-NEXT: s_lshr_b32 s42, s7, 16 +; GFX11-NEXT: s_lshr_b32 s53, s23, 8 +; GFX11-NEXT: s_lshr_b32 s54, s22, 16 +; GFX11-NEXT: s_lshr_b32 s55, s22, 8 +; GFX11-NEXT: v_writelane_b32 v19, s42, 10 +; GFX11-NEXT: s_lshr_b32 s42, s7, 8 +; GFX11-NEXT: s_lshr_b32 s64, s21, 24 +; GFX11-NEXT: s_lshr_b32 s65, s21, 16 +; GFX11-NEXT: s_lshr_b32 s66, s21, 8 +; GFX11-NEXT: v_writelane_b32 v19, s42, 11 +; GFX11-NEXT: s_lshr_b32 s42, s6, 16 +; GFX11-NEXT: s_lshr_b32 s67, s20, 16 +; GFX11-NEXT: s_lshr_b32 s68, s20, 8 +; GFX11-NEXT: s_lshr_b32 s69, s19, 24 +; GFX11-NEXT: v_writelane_b32 v19, s42, 12 +; GFX11-NEXT: s_lshr_b32 s42, s6, 8 +; GFX11-NEXT: s_lshr_b32 s70, s19, 16 +; GFX11-NEXT: s_lshr_b32 s71, s19, 8 +; GFX11-NEXT: s_lshr_b32 s80, s18, 16 +; GFX11-NEXT: v_writelane_b32 v19, s42, 13 +; GFX11-NEXT: s_lshr_b32 s42, s9, 24 +; GFX11-NEXT: s_lshr_b32 s81, s18, 8 +; GFX11-NEXT: s_lshr_b32 s82, s17, 24 +; GFX11-NEXT: s_lshr_b32 s83, s17, 16 +; GFX11-NEXT: v_writelane_b32 v19, s42, 14 +; GFX11-NEXT: s_lshr_b32 s42, s9, 16 +; GFX11-NEXT: s_lshr_b32 s84, s17, 8 +; GFX11-NEXT: s_lshr_b32 s85, s16, 16 +; GFX11-NEXT: s_lshr_b32 s86, s16, 8 +; GFX11-NEXT: v_writelane_b32 v19, s42, 15 +; GFX11-NEXT: s_lshr_b32 s42, s9, 8 +; GFX11-NEXT: s_lshr_b32 s87, s3, 24 +; GFX11-NEXT: s_lshr_b32 s96, s3, 16 +; GFX11-NEXT: s_lshr_b32 s97, s3, 8 +; GFX11-NEXT: v_writelane_b32 v19, s42, 16 +; GFX11-NEXT: s_lshr_b32 s42, s8, 16 +; GFX11-NEXT: s_lshr_b32 s43, s2, 8 ; GFX11-NEXT: s_lshr_b32 s98, s1, 24 -; GFX11-NEXT: v_writelane_b32 v18, s43, 2 -; GFX11-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-NEXT: v_writelane_b32 v19, s62, 12 ; GFX11-NEXT: s_lshr_b32 s99, s1, 16 +; GFX11-NEXT: v_writelane_b32 v19, s42, 17 +; GFX11-NEXT: s_lshr_b32 s42, s8, 8 ; GFX11-NEXT: s_lshr_b32 s100, s1, 8 -; 
GFX11-NEXT: v_writelane_b32 v18, s43, 3 -; GFX11-NEXT: s_lshr_b32 s43, s17, 8 -; GFX11-NEXT: v_writelane_b32 v19, s63, 13 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[8:9], 24 -; GFX11-NEXT: s_lshr_b32 s44, s0, 16 -; GFX11-NEXT: v_writelane_b32 v18, s43, 4 -; GFX11-NEXT: s_lshr_b32 s43, s16, 16 -; GFX11-NEXT: v_writelane_b32 v19, s62, 10 -; GFX11-NEXT: s_lshr_b32 s45, s0, 8 +; GFX11-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-NEXT: s_lshr_b32 s101, s0, 8 +; GFX11-NEXT: v_writelane_b32 v19, s42, 18 +; GFX11-NEXT: s_lshr_b32 s42, s11, 24 +; GFX11-NEXT: s_lshr_b64 s[56:57], s[10:11], 24 +; GFX11-NEXT: s_lshr_b64 s[58:59], s[12:13], 24 +; GFX11-NEXT: s_lshr_b64 s[60:61], s[14:15], 24 +; GFX11-NEXT: v_writelane_b32 v19, s42, 19 +; GFX11-NEXT: s_lshr_b32 s42, s11, 16 +; GFX11-NEXT: s_lshr_b64 s[62:63], s[40:41], 24 +; GFX11-NEXT: s_lshr_b64 s[72:73], s[28:29], 24 ; GFX11-NEXT: s_lshr_b64 s[76:77], s[26:27], 24 -; GFX11-NEXT: v_writelane_b32 v18, s43, 5 -; GFX11-NEXT: s_lshr_b32 s43, s16, 8 -; GFX11-NEXT: v_writelane_b32 v19, s63, 11 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[10:11], 24 +; GFX11-NEXT: v_writelane_b32 v19, s42, 20 +; GFX11-NEXT: s_lshr_b32 s42, s11, 8 ; GFX11-NEXT: s_lshr_b64 s[88:89], s[24:25], 24 -; GFX11-NEXT: v_writelane_b32 v18, s43, 6 -; GFX11-NEXT: s_lshr_b32 s43, s3, 24 -; GFX11-NEXT: v_writelane_b32 v19, s62, 8 +; GFX11-NEXT: s_lshr_b64 s[74:75], s[22:23], 24 ; GFX11-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 +; GFX11-NEXT: v_writelane_b32 v19, s42, 21 +; GFX11-NEXT: s_lshr_b32 s42, s10, 16 ; GFX11-NEXT: s_lshr_b64 s[90:91], s[18:19], 24 -; GFX11-NEXT: v_writelane_b32 v18, s43, 7 -; GFX11-NEXT: s_lshr_b32 s43, s3, 16 -; GFX11-NEXT: v_writelane_b32 v19, s63, 9 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 ; GFX11-NEXT: s_lshr_b64 s[92:93], s[16:17], 24 -; GFX11-NEXT: v_writelane_b32 v18, s43, 8 -; GFX11-NEXT: s_lshr_b32 s43, s3, 8 -; GFX11-NEXT: v_writelane_b32 v19, s62, 6 ; GFX11-NEXT: s_lshr_b64 s[94:95], s[2:3], 24 +; GFX11-NEXT: v_writelane_b32 v19, s42, 22 +; 
GFX11-NEXT: s_lshr_b32 s42, s10, 8 ; GFX11-NEXT: s_lshr_b64 s[30:31], s[0:1], 24 -; GFX11-NEXT: v_writelane_b32 v18, s43, 9 -; GFX11-NEXT: s_lshr_b32 s43, s2, 8 -; GFX11-NEXT: v_writelane_b32 v19, s63, 7 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[14:15], 24 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v19, s62, 4 -; GFX11-NEXT: v_writelane_b32 v19, s63, 5 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[40:41], 24 -; GFX11-NEXT: v_writelane_b32 v19, s62, 2 -; GFX11-NEXT: v_writelane_b32 v19, s63, 3 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[28:29], 24 +; GFX11-NEXT: v_writelane_b32 v19, s42, 23 +; GFX11-NEXT: s_lshr_b32 s42, s13, 24 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v19, s42, 24 +; GFX11-NEXT: s_lshr_b32 s42, s13, 16 +; GFX11-NEXT: v_writelane_b32 v19, s42, 25 +; GFX11-NEXT: s_lshr_b32 s42, s13, 8 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v19, s42, 26 +; GFX11-NEXT: s_lshr_b32 s42, s12, 16 +; GFX11-NEXT: v_writelane_b32 v19, s42, 27 +; GFX11-NEXT: s_lshr_b32 s42, s12, 8 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v19, s42, 28 +; GFX11-NEXT: s_lshr_b32 s42, s15, 24 +; GFX11-NEXT: v_writelane_b32 v19, s42, 29 +; GFX11-NEXT: s_lshr_b32 s42, s15, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v19, s42, 30 +; GFX11-NEXT: s_lshr_b32 s42, s15, 8 +; GFX11-NEXT: v_writelane_b32 v19, s42, 31 +; GFX11-NEXT: s_lshr_b32 s42, s14, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v19, s62, 0 -; GFX11-NEXT: v_writelane_b32 v19, s63, 1 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 -; GFX11-NEXT: s_branch .LBB13_3 -; GFX11-NEXT: .LBB13_2: -; GFX11-NEXT: ; implicit-def: 
$vcc_hi -; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: s_mov_b32 s101, -1 -; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 0 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v19, vcc_hi, 1 -; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 2 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v19, vcc_hi, 3 -; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 4 -; GFX11-NEXT: ; kill: killed 
$sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v19, vcc_hi, 5 -; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: ; implicit-def: $sgpr45 -; GFX11-NEXT: ; implicit-def: $sgpr44 -; GFX11-NEXT: ; implicit-def: $sgpr30 -; GFX11-NEXT: ; implicit-def: $sgpr100 -; GFX11-NEXT: ; implicit-def: $sgpr99 -; GFX11-NEXT: ; implicit-def: $sgpr98 -; GFX11-NEXT: ; implicit-def: $sgpr43 -; GFX11-NEXT: ; implicit-def: $sgpr74 -; GFX11-NEXT: ; implicit-def: $sgpr94 -; GFX11-NEXT: ; implicit-def: $sgpr92 -; GFX11-NEXT: ; implicit-def: $sgpr90 -; GFX11-NEXT: ; implicit-def: $sgpr78 -; GFX11-NEXT: ; implicit-def: $sgpr62 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr97 -; GFX11-NEXT: ; implicit-def: $sgpr96 -; GFX11-NEXT: ; implicit-def: $sgpr73 -; GFX11-NEXT: ; implicit-def: $sgpr87 -; GFX11-NEXT: ; implicit-def: $sgpr72 -; GFX11-NEXT: ; implicit-def: $sgpr86 -; GFX11-NEXT: ; implicit-def: $sgpr85 -; GFX11-NEXT: ; implicit-def: $sgpr84 -; GFX11-NEXT: ; implicit-def: $sgpr83 -; GFX11-NEXT: ; implicit-def: $sgpr82 -; GFX11-NEXT: ; implicit-def: $sgpr81 -; GFX11-NEXT: ; implicit-def: $sgpr61 -; GFX11-NEXT: ; implicit-def: $sgpr80 -; GFX11-NEXT: ; implicit-def: $sgpr60 -; GFX11-NEXT: ; implicit-def: $sgpr71 -; GFX11-NEXT: ; implicit-def: $sgpr70 -; GFX11-NEXT: ; implicit-def: $sgpr58 -; GFX11-NEXT: ; implicit-def: $sgpr59 -; GFX11-NEXT: ; implicit-def: $sgpr68 -; GFX11-NEXT: ; implicit-def: $sgpr67 -; GFX11-NEXT: ; implicit-def: $sgpr66 -; GFX11-NEXT: ; implicit-def: $sgpr65 -; GFX11-NEXT: ; implicit-def: $sgpr64 -; GFX11-NEXT: ; implicit-def: $sgpr55 -; GFX11-NEXT: ; implicit-def: $sgpr54 
-; GFX11-NEXT: ; implicit-def: $sgpr53 -; GFX11-NEXT: ; implicit-def: $sgpr52 -; GFX11-NEXT: ; implicit-def: $sgpr51 -; GFX11-NEXT: ; implicit-def: $sgpr50 -; GFX11-NEXT: ; implicit-def: $sgpr49 -; GFX11-NEXT: ; implicit-def: $sgpr48 -; GFX11-NEXT: ; implicit-def: $sgpr39 -; GFX11-NEXT: ; implicit-def: $sgpr38 -; GFX11-NEXT: ; implicit-def: $sgpr37 -; GFX11-NEXT: ; implicit-def: $sgpr36 -; GFX11-NEXT: ; implicit-def: $sgpr35 -; GFX11-NEXT: ; implicit-def: $sgpr56 -; GFX11-NEXT: ; implicit-def: $sgpr69 -; GFX11-NEXT: ; implicit-def: $sgpr34 -; GFX11-NEXT: ; implicit-def: $sgpr46 -; GFX11-NEXT: ; implicit-def: $sgpr47 -; GFX11-NEXT: ; implicit-def: $sgpr57 -; GFX11-NEXT: ; implicit-def: $sgpr103 -; GFX11-NEXT: ; implicit-def: $sgpr102 -; GFX11-NEXT: ; implicit-def: $sgpr104 -; GFX11-NEXT: ; implicit-def: $sgpr88 -; GFX11-NEXT: ; implicit-def: $sgpr76 -; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 6 -; GFX11-NEXT: v_writelane_b32 v19, vcc_hi, 7 -; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 8 -; GFX11-NEXT: v_writelane_b32 v19, vcc_hi, 9 -; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 10 -; GFX11-NEXT: v_writelane_b32 v19, vcc_hi, 11 -; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 12 -; GFX11-NEXT: v_writelane_b32 v19, vcc_hi, 13 -; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 14 -; GFX11-NEXT: v_writelane_b32 v19, vcc_hi, 15 -; GFX11-NEXT: .LBB13_3: ; %Flow -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s101 -; GFX11-NEXT: s_mov_b32 s101, s104 -; GFX11-NEXT: s_mov_b32 s104, s57 -; GFX11-NEXT: s_mov_b32 s57, s69 -; GFX11-NEXT: s_mov_b32 s69, s42 -; GFX11-NEXT: s_cbranch_vccnz .LBB13_5 -; GFX11-NEXT: ; %bb.4: ; %cmp.true -; GFX11-NEXT: s_add_i32 s25, s25, 3 -; GFX11-NEXT: s_add_i32 s24, s24, 3 -; GFX11-NEXT: s_lshr_b32 s42, s25, 8 -; GFX11-NEXT: s_add_i32 s23, s23, 3 -; GFX11-NEXT: v_writelane_b32 v19, s42, 16 -; GFX11-NEXT: 
s_lshr_b32 s42, s24, 16 -; GFX11-NEXT: s_add_i32 s22, s22, 3 -; GFX11-NEXT: s_add_i32 s21, s21, 3 -; GFX11-NEXT: s_add_i32 s20, s20, 3 -; GFX11-NEXT: v_writelane_b32 v19, s42, 17 -; GFX11-NEXT: s_lshr_b32 s42, s24, 8 -; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: v_writelane_b32 v18, s42, 0 +; GFX11-NEXT: s_lshr_b32 s42, s14, 8 +; GFX11-NEXT: v_writelane_b32 v19, s46, 2 +; GFX11-NEXT: v_writelane_b32 v18, s42, 1 +; GFX11-NEXT: s_lshr_b32 s42, s41, 24 +; GFX11-NEXT: v_writelane_b32 v19, s47, 3 +; GFX11-NEXT: s_lshr_b64 s[46:47], s[6:7], 24 +; GFX11-NEXT: v_writelane_b32 v18, s42, 2 +; GFX11-NEXT: s_lshr_b32 s42, s41, 16 +; GFX11-NEXT: v_writelane_b32 v19, s46, 0 +; GFX11-NEXT: v_writelane_b32 v18, s42, 3 +; GFX11-NEXT: s_lshr_b32 s42, s41, 8 +; GFX11-NEXT: v_writelane_b32 v19, s47, 1 +; GFX11-NEXT: s_lshr_b64 s[46:47], s[8:9], 24 +; GFX11-NEXT: v_writelane_b32 v18, s42, 4 +; GFX11-NEXT: s_lshr_b32 s42, s40, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v18, s42, 5 +; GFX11-NEXT: s_lshr_b32 s42, s40, 8 +; GFX11-NEXT: v_writelane_b32 v18, s42, 6 +; GFX11-NEXT: s_lshr_b32 s42, s29, 24 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v18, s42, 7 +; GFX11-NEXT: s_lshr_b32 s42, s29, 16 +; GFX11-NEXT: v_writelane_b32 v18, s42, 8 +; GFX11-NEXT: s_lshr_b32 s42, s2, 16 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s44 +; GFX11-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-NEXT: .LBB13_2: ; %cmp.true ; GFX11-NEXT: s_add_i32 s5, s5, 3 ; GFX11-NEXT: s_add_i32 s4, s4, 3 -; GFX11-NEXT: v_writelane_b32 v19, s42, 18 -; GFX11-NEXT: s_lshr_b32 s42, s23, 24 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[4:5], 24 +; GFX11-NEXT: s_lshr_b32 s42, s5, 24 ; GFX11-NEXT: s_add_i32 s7, s7, 3 +; GFX11-NEXT: v_writelane_b32 v19, s42, 4 +; GFX11-NEXT: s_lshr_b32 s42, s5, 16 ; GFX11-NEXT: s_add_i32 s6, s6, 3 -; GFX11-NEXT: v_writelane_b32 v19, s42, 19 -; 
GFX11-NEXT: s_lshr_b32 s42, s23, 16 ; GFX11-NEXT: s_add_i32 s9, s9, 3 ; GFX11-NEXT: s_add_i32 s8, s8, 3 +; GFX11-NEXT: v_writelane_b32 v19, s42, 5 +; GFX11-NEXT: s_lshr_b32 s42, s5, 8 ; GFX11-NEXT: s_add_i32 s11, s11, 3 -; GFX11-NEXT: v_writelane_b32 v19, s42, 20 -; GFX11-NEXT: s_lshr_b32 s42, s23, 8 ; GFX11-NEXT: s_add_i32 s10, s10, 3 -; GFX11-NEXT: s_add_i32 s18, s18, 3 ; GFX11-NEXT: s_add_i32 s13, s13, 3 -; GFX11-NEXT: v_writelane_b32 v19, s42, 21 -; GFX11-NEXT: s_lshr_b32 s42, s22, 16 +; GFX11-NEXT: v_writelane_b32 v19, s42, 6 +; GFX11-NEXT: s_lshr_b32 s42, s4, 16 ; GFX11-NEXT: s_add_i32 s12, s12, 3 -; GFX11-NEXT: s_add_i32 s17, s17, 3 ; GFX11-NEXT: s_add_i32 s15, s15, 3 -; GFX11-NEXT: v_writelane_b32 v19, s42, 22 -; GFX11-NEXT: s_lshr_b32 s42, s22, 8 ; GFX11-NEXT: s_add_i32 s14, s14, 3 -; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: v_writelane_b32 v19, s42, 7 +; GFX11-NEXT: s_lshr_b32 s42, s4, 8 ; GFX11-NEXT: s_add_i32 s41, s41, 3 -; GFX11-NEXT: v_writelane_b32 v19, s42, 23 -; GFX11-NEXT: s_lshr_b32 s42, s21, 24 ; GFX11-NEXT: s_add_i32 s40, s40, 3 -; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 +; GFX11-NEXT: v_writelane_b32 v19, s42, 8 +; GFX11-NEXT: s_lshr_b32 s42, s7, 24 ; GFX11-NEXT: s_add_i32 s29, s29, 3 -; GFX11-NEXT: v_writelane_b32 v19, s42, 24 -; GFX11-NEXT: s_lshr_b32 s42, s21, 16 -; GFX11-NEXT: s_add_i32 s28, s28, 3 ; GFX11-NEXT: s_add_i32 s1, s1, 3 ; GFX11-NEXT: s_add_i32 s0, s0, 3 -; GFX11-NEXT: v_writelane_b32 v19, s42, 25 -; GFX11-NEXT: s_lshr_b32 s42, s21, 8 +; GFX11-NEXT: v_writelane_b32 v19, s42, 9 +; GFX11-NEXT: s_lshr_b32 s42, s7, 16 +; GFX11-NEXT: s_add_i32 s3, s3, 3 ; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: v_writelane_b32 v19, s42, 10 +; GFX11-NEXT: s_lshr_b32 s42, s7, 8 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: v_writelane_b32 v19, s42, 11 +; GFX11-NEXT: s_lshr_b32 s42, 
s6, 16 +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s23, s23, 3 +; GFX11-NEXT: v_writelane_b32 v19, s42, 12 +; GFX11-NEXT: s_lshr_b32 s42, s6, 8 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_add_i32 s25, s25, 3 +; GFX11-NEXT: s_add_i32 s24, s24, 3 +; GFX11-NEXT: v_writelane_b32 v19, s42, 13 +; GFX11-NEXT: s_lshr_b32 s42, s9, 24 ; GFX11-NEXT: s_add_i32 s27, s27, 3 ; GFX11-NEXT: s_add_i32 s26, s26, 3 +; GFX11-NEXT: s_add_i32 s28, s28, 3 +; GFX11-NEXT: v_writelane_b32 v19, s42, 14 +; GFX11-NEXT: s_lshr_b32 s42, s9, 16 +; GFX11-NEXT: s_lshr_b32 s102, s29, 8 +; GFX11-NEXT: s_lshr_b32 s103, s28, 16 +; GFX11-NEXT: s_lshr_b32 s104, s28, 8 +; GFX11-NEXT: v_writelane_b32 v19, s42, 15 +; GFX11-NEXT: s_lshr_b32 s42, s9, 8 +; GFX11-NEXT: s_lshr_b32 vcc_hi, s27, 24 +; GFX11-NEXT: s_lshr_b32 s34, s27, 16 +; GFX11-NEXT: s_lshr_b32 s35, s27, 8 +; GFX11-NEXT: v_writelane_b32 v19, s42, 16 +; GFX11-NEXT: s_lshr_b32 s42, s8, 16 +; GFX11-NEXT: s_lshr_b32 s36, s26, 16 +; GFX11-NEXT: s_lshr_b32 s37, s26, 8 +; GFX11-NEXT: s_lshr_b32 s38, s25, 24 +; GFX11-NEXT: v_writelane_b32 v19, s42, 17 +; GFX11-NEXT: s_lshr_b32 s42, s8, 8 +; GFX11-NEXT: s_lshr_b32 s39, s25, 16 +; GFX11-NEXT: s_lshr_b32 s48, s25, 8 +; GFX11-NEXT: s_lshr_b32 s49, s24, 16 +; GFX11-NEXT: v_writelane_b32 v19, s42, 18 +; GFX11-NEXT: s_lshr_b32 s42, s11, 24 +; GFX11-NEXT: s_lshr_b32 s50, s24, 8 +; GFX11-NEXT: s_lshr_b32 s51, s23, 24 +; GFX11-NEXT: s_lshr_b32 s52, s23, 16 +; GFX11-NEXT: v_writelane_b32 v19, s42, 19 +; GFX11-NEXT: s_lshr_b32 s42, s11, 16 +; GFX11-NEXT: s_lshr_b32 s53, s23, 8 +; GFX11-NEXT: s_lshr_b32 s54, s22, 16 +; GFX11-NEXT: s_lshr_b32 s55, s22, 8 +; GFX11-NEXT: v_writelane_b32 v19, s42, 20 +; GFX11-NEXT: s_lshr_b32 s42, s11, 8 +; GFX11-NEXT: s_lshr_b32 s64, s21, 24 +; GFX11-NEXT: s_lshr_b32 s65, s21, 16 +; GFX11-NEXT: s_lshr_b32 s66, s21, 8 +; GFX11-NEXT: v_writelane_b32 v19, s42, 21 +; GFX11-NEXT: s_lshr_b32 s42, s10, 16 +; GFX11-NEXT: s_lshr_b32 
s67, s20, 16 +; GFX11-NEXT: s_lshr_b32 s68, s20, 8 +; GFX11-NEXT: s_lshr_b32 s69, s19, 24 +; GFX11-NEXT: v_writelane_b32 v19, s42, 22 +; GFX11-NEXT: s_lshr_b32 s42, s10, 8 +; GFX11-NEXT: s_lshr_b32 s70, s19, 16 +; GFX11-NEXT: s_lshr_b32 s71, s19, 8 +; GFX11-NEXT: s_lshr_b32 s80, s18, 16 +; GFX11-NEXT: v_writelane_b32 v19, s42, 23 +; GFX11-NEXT: s_lshr_b32 s42, s13, 24 +; GFX11-NEXT: s_lshr_b32 s81, s18, 8 +; GFX11-NEXT: s_lshr_b32 s82, s17, 24 +; GFX11-NEXT: s_lshr_b32 s83, s17, 16 +; GFX11-NEXT: v_writelane_b32 v19, s42, 24 +; GFX11-NEXT: s_lshr_b32 s42, s13, 16 +; GFX11-NEXT: s_lshr_b32 s84, s17, 8 +; GFX11-NEXT: s_lshr_b32 s85, s16, 16 +; GFX11-NEXT: s_lshr_b32 s86, s16, 8 +; GFX11-NEXT: v_writelane_b32 v19, s42, 25 +; GFX11-NEXT: s_lshr_b32 s42, s13, 8 +; GFX11-NEXT: s_lshr_b32 s87, s3, 24 +; GFX11-NEXT: s_lshr_b32 s96, s3, 16 +; GFX11-NEXT: s_lshr_b32 s97, s3, 8 ; GFX11-NEXT: v_writelane_b32 v19, s42, 26 -; GFX11-NEXT: s_lshr_b32 s42, s20, 16 -; GFX11-NEXT: s_lshr_b32 s101, s5, 24 -; GFX11-NEXT: s_lshr_b32 s102, s5, 16 -; GFX11-NEXT: s_lshr_b32 s103, s5, 8 +; GFX11-NEXT: s_lshr_b32 s42, s12, 16 +; GFX11-NEXT: s_lshr_b32 s43, s2, 8 +; GFX11-NEXT: s_lshr_b32 s98, s1, 24 +; GFX11-NEXT: s_lshr_b32 s99, s1, 16 ; GFX11-NEXT: v_writelane_b32 v19, s42, 27 -; GFX11-NEXT: s_lshr_b32 s42, s20, 8 -; GFX11-NEXT: s_lshr_b32 s104, s4, 16 -; GFX11-NEXT: s_lshr_b32 s47, s4, 8 -; GFX11-NEXT: s_lshr_b32 s46, s7, 24 +; GFX11-NEXT: s_lshr_b32 s42, s12, 8 +; GFX11-NEXT: s_lshr_b32 s100, s1, 8 +; GFX11-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-NEXT: s_lshr_b32 s101, s0, 8 ; GFX11-NEXT: v_writelane_b32 v19, s42, 28 -; GFX11-NEXT: s_lshr_b32 s42, s19, 24 -; GFX11-NEXT: s_lshr_b32 vcc_hi, s7, 16 -; GFX11-NEXT: s_lshr_b32 s34, s7, 8 -; GFX11-NEXT: s_lshr_b32 s57, s6, 16 +; GFX11-NEXT: s_lshr_b32 s42, s15, 24 +; GFX11-NEXT: s_lshr_b64 s[56:57], s[10:11], 24 +; GFX11-NEXT: s_lshr_b64 s[58:59], s[12:13], 24 +; GFX11-NEXT: s_lshr_b64 s[60:61], s[14:15], 24 ; GFX11-NEXT: v_writelane_b32 v19, 
s42, 29 -; GFX11-NEXT: s_lshr_b32 s42, s19, 16 -; GFX11-NEXT: s_lshr_b32 s56, s6, 8 -; GFX11-NEXT: s_lshr_b32 s35, s9, 24 -; GFX11-NEXT: s_lshr_b32 s36, s9, 16 +; GFX11-NEXT: s_lshr_b32 s42, s15, 16 +; GFX11-NEXT: s_lshr_b64 s[62:63], s[40:41], 24 +; GFX11-NEXT: s_lshr_b64 s[72:73], s[28:29], 24 +; GFX11-NEXT: s_lshr_b64 s[76:77], s[26:27], 24 ; GFX11-NEXT: v_writelane_b32 v19, s42, 30 -; GFX11-NEXT: s_lshr_b32 s42, s19, 8 -; GFX11-NEXT: s_lshr_b32 s37, s9, 8 -; GFX11-NEXT: s_lshr_b32 s38, s8, 16 -; GFX11-NEXT: s_lshr_b32 s39, s8, 8 +; GFX11-NEXT: s_lshr_b32 s42, s15, 8 +; GFX11-NEXT: s_lshr_b64 s[88:89], s[24:25], 24 +; GFX11-NEXT: s_lshr_b64 s[74:75], s[22:23], 24 +; GFX11-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 ; GFX11-NEXT: v_writelane_b32 v19, s42, 31 -; GFX11-NEXT: s_lshr_b32 s42, s18, 16 -; GFX11-NEXT: s_lshr_b32 s48, s11, 24 +; GFX11-NEXT: s_lshr_b32 s42, s14, 16 +; GFX11-NEXT: s_lshr_b64 s[90:91], s[18:19], 24 ; GFX11-NEXT: v_writelane_b32 v18, s42, 0 -; GFX11-NEXT: s_lshr_b32 s42, s18, 8 -; GFX11-NEXT: v_writelane_b32 v19, s62, 14 -; GFX11-NEXT: s_lshr_b32 s49, s11, 16 -; GFX11-NEXT: s_lshr_b32 s50, s11, 8 +; GFX11-NEXT: s_lshr_b32 s42, s14, 8 +; GFX11-NEXT: v_writelane_b32 v19, s46, 2 +; GFX11-NEXT: s_lshr_b64 s[92:93], s[16:17], 24 +; GFX11-NEXT: s_lshr_b64 s[94:95], s[2:3], 24 ; GFX11-NEXT: v_writelane_b32 v18, s42, 1 -; GFX11-NEXT: s_lshr_b32 s42, s17, 24 -; GFX11-NEXT: v_writelane_b32 v19, s63, 15 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[6:7], 24 -; GFX11-NEXT: s_lshr_b32 s51, s10, 16 +; GFX11-NEXT: s_lshr_b32 s42, s41, 24 +; GFX11-NEXT: v_writelane_b32 v19, s47, 3 +; GFX11-NEXT: s_lshr_b64 s[46:47], s[6:7], 24 +; GFX11-NEXT: s_lshr_b64 s[30:31], s[0:1], 24 ; GFX11-NEXT: v_writelane_b32 v18, s42, 2 -; GFX11-NEXT: s_lshr_b32 s42, s17, 16 -; GFX11-NEXT: v_writelane_b32 v19, s62, 12 -; GFX11-NEXT: s_lshr_b32 s52, s10, 8 -; GFX11-NEXT: s_lshr_b32 s53, s13, 24 +; GFX11-NEXT: s_lshr_b32 s42, s41, 16 +; GFX11-NEXT: v_writelane_b32 v19, s46, 0 ; GFX11-NEXT: 
v_writelane_b32 v18, s42, 3 -; GFX11-NEXT: s_lshr_b32 s42, s17, 8 -; GFX11-NEXT: v_writelane_b32 v19, s63, 13 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[8:9], 24 -; GFX11-NEXT: s_lshr_b32 s54, s13, 16 +; GFX11-NEXT: s_lshr_b32 s42, s41, 8 +; GFX11-NEXT: v_writelane_b32 v19, s47, 1 +; GFX11-NEXT: s_lshr_b64 s[46:47], s[8:9], 24 ; GFX11-NEXT: v_writelane_b32 v18, s42, 4 -; GFX11-NEXT: s_lshr_b32 s42, s16, 16 -; GFX11-NEXT: v_writelane_b32 v19, s62, 10 -; GFX11-NEXT: s_lshr_b32 s55, s13, 8 -; GFX11-NEXT: s_lshr_b32 s64, s12, 16 +; GFX11-NEXT: s_lshr_b32 s42, s40, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_writelane_b32 v18, s42, 5 -; GFX11-NEXT: s_lshr_b32 s42, s16, 8 -; GFX11-NEXT: v_writelane_b32 v19, s63, 11 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[10:11], 24 -; GFX11-NEXT: s_lshr_b32 s65, s12, 8 +; GFX11-NEXT: s_lshr_b32 s42, s40, 8 ; GFX11-NEXT: v_writelane_b32 v18, s42, 6 -; GFX11-NEXT: s_lshr_b32 s42, s3, 24 -; GFX11-NEXT: v_writelane_b32 v19, s62, 8 -; GFX11-NEXT: s_lshr_b32 s66, s15, 24 -; GFX11-NEXT: s_lshr_b32 s67, s15, 16 +; GFX11-NEXT: s_lshr_b32 s42, s29, 24 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_writelane_b32 v18, s42, 7 -; GFX11-NEXT: s_lshr_b32 s42, s3, 16 -; GFX11-NEXT: v_writelane_b32 v19, s63, 9 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 -; GFX11-NEXT: s_lshr_b32 s68, s15, 8 +; GFX11-NEXT: s_lshr_b32 s42, s29, 16 ; GFX11-NEXT: v_writelane_b32 v18, s42, 8 -; GFX11-NEXT: s_lshr_b32 s59, s14, 16 -; GFX11-NEXT: v_writelane_b32 v19, s62, 6 -; GFX11-NEXT: s_lshr_b32 s58, s14, 8 -; GFX11-NEXT: s_lshr_b32 s70, s41, 24 -; GFX11-NEXT: s_lshr_b32 s71, s41, 16 -; GFX11-NEXT: s_lshr_b32 s60, s41, 8 -; GFX11-NEXT: v_writelane_b32 v19, s63, 7 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[14:15], 24 -; GFX11-NEXT: s_lshr_b32 s80, s40, 16 -; GFX11-NEXT: s_lshr_b32 s61, s40, 8 -; GFX11-NEXT: s_lshr_b32 s81, s29, 24 -; GFX11-NEXT: 
v_writelane_b32 v19, s62, 4 -; GFX11-NEXT: s_lshr_b32 s82, s29, 16 -; GFX11-NEXT: s_lshr_b32 s83, s29, 8 -; GFX11-NEXT: s_lshr_b32 s84, s28, 16 -; GFX11-NEXT: s_lshr_b32 s85, s28, 8 -; GFX11-NEXT: v_writelane_b32 v19, s63, 5 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[40:41], 24 -; GFX11-NEXT: s_lshr_b32 s86, s27, 24 -; GFX11-NEXT: s_lshr_b32 s72, s27, 16 -; GFX11-NEXT: s_lshr_b32 s87, s27, 8 -; GFX11-NEXT: v_writelane_b32 v19, s62, 2 -; GFX11-NEXT: s_lshr_b32 s73, s26, 16 -; GFX11-NEXT: s_lshr_b32 s96, s26, 8 -; GFX11-NEXT: s_lshr_b32 s97, s25, 24 -; GFX11-NEXT: s_lshr_b32 s69, s25, 16 -; GFX11-NEXT: v_writelane_b32 v19, s63, 3 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[28:29], 24 -; GFX11-NEXT: s_lshr_b32 s42, s3, 8 -; GFX11-NEXT: s_lshr_b32 s74, s2, 16 -; GFX11-NEXT: s_lshr_b32 s43, s2, 8 -; GFX11-NEXT: v_writelane_b32 v19, s62, 0 -; GFX11-NEXT: s_lshr_b32 s98, s1, 24 -; GFX11-NEXT: s_lshr_b32 s99, s1, 16 -; GFX11-NEXT: s_lshr_b32 s100, s1, 8 -; GFX11-NEXT: s_lshr_b32 s44, s0, 16 -; GFX11-NEXT: s_lshr_b32 s45, s0, 8 -; GFX11-NEXT: v_writelane_b32 v19, s63, 1 -; GFX11-NEXT: s_lshr_b64 s[76:77], s[26:27], 24 -; GFX11-NEXT: s_lshr_b64 s[88:89], s[24:25], 24 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 -; GFX11-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 -; GFX11-NEXT: s_lshr_b64 s[90:91], s[18:19], 24 -; GFX11-NEXT: s_lshr_b64 s[92:93], s[16:17], 24 -; GFX11-NEXT: s_lshr_b64 s[94:95], s[2:3], 24 -; GFX11-NEXT: s_lshr_b64 s[30:31], s[0:1], 24 -; GFX11-NEXT: v_writelane_b32 v18, s42, 9 -; GFX11-NEXT: .LBB13_5: ; %end +; GFX11-NEXT: s_lshr_b32 s42, s2, 16 +; GFX11-NEXT: .LBB13_3: ; %end +; GFX11-NEXT: s_lshl_b32 s44, s101, 8 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_and_b32 s45, s45, 0xff +; GFX11-NEXT: s_or_b32 s0, s0, s44 +; GFX11-NEXT: s_lshl_b32 s44, s30, 8 ; GFX11-NEXT: s_lshl_b32 s43, s43, 8 +; GFX11-NEXT: s_or_b32 s44, s45, s44 ; GFX11-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-NEXT: s_and_b32 s42, s74, 0xff -; GFX11-NEXT: s_or_b32 s2, s2, s43 -; GFX11-NEXT: 
s_lshl_b32 s43, s94, 8 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_or_b32 s42, s42, s43 -; GFX11-NEXT: s_lshl_b32 s45, s45, 8 -; GFX11-NEXT: s_lshl_b32 s42, s42, 16 -; GFX11-NEXT: s_and_b32 s0, s0, 0xff -; GFX11-NEXT: s_or_b32 s2, s2, s42 -; GFX11-NEXT: v_readlane_b32 s42, v18, 9 -; GFX11-NEXT: s_or_b32 s0, s0, s45 -; GFX11-NEXT: s_lshl_b32 s45, s30, 8 -; GFX11-NEXT: s_and_b32 s44, s44, 0xff -; GFX11-NEXT: s_and_b32 s3, s3, 0xff -; GFX11-NEXT: s_or_b32 s44, s44, s45 -; GFX11-NEXT: s_lshl_b32 s42, s42, 8 ; GFX11-NEXT: s_and_b32 s0, s0, 0xffff ; GFX11-NEXT: s_lshl_b32 s44, s44, 16 -; GFX11-NEXT: s_or_b32 s3, s3, s42 -; GFX11-NEXT: v_readlane_b32 s42, v18, 8 -; GFX11-NEXT: v_readlane_b32 s43, v18, 7 +; GFX11-NEXT: s_or_b32 s2, s2, s43 +; GFX11-NEXT: s_lshl_b32 s43, s94, 8 +; GFX11-NEXT: s_and_b32 s42, s42, 0xff ; GFX11-NEXT: s_or_b32 s0, s0, s44 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff ; GFX11-NEXT: s_lshl_b32 s44, s100, 8 -; GFX11-NEXT: s_lshl_b32 s45, s98, 8 +; GFX11-NEXT: s_or_b32 s42, s42, s43 ; GFX11-NEXT: s_or_b32 s1, s1, s44 ; GFX11-NEXT: s_and_b32 s44, s99, 0xff -; GFX11-NEXT: s_and_b32 s42, s42, 0xff +; GFX11-NEXT: s_lshl_b32 s45, s98, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s42, s42, 16 ; GFX11-NEXT: s_or_b32 s44, s44, s45 -; GFX11-NEXT: s_lshl_b32 s43, s43, 8 +; GFX11-NEXT: s_or_b32 s2, s2, s42 +; GFX11-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-NEXT: s_lshl_b32 s42, s97, 8 ; GFX11-NEXT: s_and_b32 s1, s1, 0xffff ; GFX11-NEXT: s_lshl_b32 s44, s44, 16 -; GFX11-NEXT: s_or_b32 s42, s42, s43 +; GFX11-NEXT: s_or_b32 s3, s3, s42 +; GFX11-NEXT: s_and_b32 s42, s96, 0xff +; GFX11-NEXT: s_lshl_b32 s43, s87, 8 ; GFX11-NEXT: s_or_b32 s1, s1, s44 +; GFX11-NEXT: s_or_b32 s42, s42, s43 ; GFX11-NEXT: s_and_b32 s3, s3, 0xffff ; GFX11-NEXT: s_lshl_b32 s42, s42, 16 ; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 -; GFX11-NEXT: v_readlane_b32 s0, v18, 6 +; GFX11-NEXT: s_lshl_b32 s0, s86, 8 +; GFX11-NEXT: s_and_b32 s1, s16, 0xff ; 
GFX11-NEXT: s_or_b32 s3, s3, s42 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 -; GFX11-NEXT: v_readlane_b32 s2, v18, 5 -; GFX11-NEXT: s_lshl_b32 s0, s0, 8 -; GFX11-NEXT: s_and_b32 s1, s16, 0xff -; GFX11-NEXT: v_readlane_b32 s3, v18, 2 ; GFX11-NEXT: s_or_b32 s0, s1, s0 ; GFX11-NEXT: s_lshl_b32 s1, s92, 8 -; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_and_b32 s2, s85, 0xff ; GFX11-NEXT: s_and_b32 s0, s0, 0xffff ; GFX11-NEXT: s_or_b32 s1, s2, s1 -; GFX11-NEXT: v_readlane_b32 s2, v18, 4 +; GFX11-NEXT: s_lshl_b32 s2, s84, 8 ; GFX11-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_lshl_b32 s3, s82, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_and_b32 s1, s17, 0xff -; GFX11-NEXT: s_lshl_b32 s2, s2, 8 -; GFX11-NEXT: v_readlane_b32 s16, v18, 0 +; GFX11-NEXT: s_and_b32 s16, s80, 0xff ; GFX11-NEXT: s_or_b32 s1, s1, s2 -; GFX11-NEXT: v_readlane_b32 s2, v18, 3 +; GFX11-NEXT: s_and_b32 s2, s83, 0xff ; GFX11-NEXT: s_and_b32 s1, s1, 0xffff -; GFX11-NEXT: v_readlane_b32 s17, v19, 29 -; GFX11-NEXT: s_and_b32 s16, s16, 0xff -; GFX11-NEXT: v_readlane_b32 s100, v17, 4 -; GFX11-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-NEXT: v_readlane_b32 s99, v17, 3 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_and_b32 s3, s18, 0xff ; GFX11-NEXT: s_lshl_b32 s2, s2, 16 -; GFX11-NEXT: s_lshl_b32 s17, s17, 8 +; GFX11-NEXT: s_lshl_b32 s17, s69, 8 ; GFX11-NEXT: s_or_b32 s1, s1, s2 -; GFX11-NEXT: v_readlane_b32 s2, v18, 1 +; GFX11-NEXT: s_lshl_b32 s2, s81, 8 ; GFX11-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 -; GFX11-NEXT: v_readlane_b32 s0, v19, 28 -; GFX11-NEXT: s_and_b32 s1, s20, 0xff -; GFX11-NEXT: s_lshl_b32 s2, s2, 8 -; GFX11-NEXT: v_readlane_b32 s18, v19, 19 ; GFX11-NEXT: s_or_b32 s2, s3, s2 ; GFX11-NEXT: s_lshl_b32 s3, s90, 8 ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: s_or_b32 s3, s16, s3 -; GFX11-NEXT: v_readlane_b32 s16, v19, 31 +; GFX11-NEXT: s_lshl_b32 s16, s71, 8 ; 
GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: s_lshl_b32 s0, s0, 8 +; GFX11-NEXT: s_lshl_b32 s0, s68, 8 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_and_b32 s3, s19, 0xff -; GFX11-NEXT: s_lshl_b32 s16, s16, 8 -; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_and_b32 s1, s20, 0xff ; GFX11-NEXT: s_or_b32 s3, s3, s16 -; GFX11-NEXT: v_readlane_b32 s16, v19, 30 +; GFX11-NEXT: s_and_b32 s16, s70, 0xff ; GFX11-NEXT: s_and_b32 s3, s3, 0xffff -; GFX11-NEXT: s_lshl_b32 s1, s78, 8 -; GFX11-NEXT: s_and_b32 s0, s0, 0xffff -; GFX11-NEXT: s_lshl_b32 s18, s18, 8 -; GFX11-NEXT: s_and_b32 s16, s16, 0xff -; GFX11-NEXT: s_lshl_b32 s19, s86, 8 ; GFX11-NEXT: s_or_b32 s16, s16, s17 -; GFX11-NEXT: v_readlane_b32 s17, v19, 21 +; GFX11-NEXT: s_or_b32 s0, s1, s0 ; GFX11-NEXT: s_lshl_b32 s16, s16, 16 -; GFX11-NEXT: v_readlane_b32 s98, v17, 2 +; GFX11-NEXT: s_lshl_b32 s1, s78, 8 ; GFX11-NEXT: s_or_b32 s3, s3, s16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 -; GFX11-NEXT: v_readlane_b32 s2, v19, 27 -; GFX11-NEXT: v_readlane_b32 s3, v19, 24 -; GFX11-NEXT: v_readlane_b32 s16, v19, 22 -; GFX11-NEXT: s_lshl_b32 s17, s17, 8 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off -; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_and_b32 s2, s67, 0xff +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff ; GFX11-NEXT: s_or_b32 s1, s2, s1 -; GFX11-NEXT: v_readlane_b32 s2, v19, 26 +; GFX11-NEXT: s_lshl_b32 s2, s66, 8 ; GFX11-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s64, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_and_b32 s1, s21, 0xff -; GFX11-NEXT: s_lshl_b32 s2, s2, 8 -; GFX11-NEXT: v_readlane_b32 s86, v16, 30 +; GFX11-NEXT: s_and_b32 s16, s54, 0xff ; GFX11-NEXT: s_or_b32 s1, s1, s2 -; GFX11-NEXT: v_readlane_b32 s2, v19, 25 +; GFX11-NEXT: s_and_b32 s2, s65, 
0xff ; GFX11-NEXT: s_and_b32 s1, s1, 0xffff -; GFX11-NEXT: v_readlane_b32 s31, v16, 1 -; GFX11-NEXT: v_readlane_b32 s30, v16, 0 -; GFX11-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_and_b32 s3, s22, 0xff ; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: s_lshl_b32 s17, s53, 8 ; GFX11-NEXT: s_or_b32 s1, s1, s2 -; GFX11-NEXT: v_readlane_b32 s2, v19, 23 -; GFX11-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 -; GFX11-NEXT: v_readlane_b32 s1, v19, 18 -; GFX11-NEXT: s_and_b32 s0, s24, 0xff -; GFX11-NEXT: s_lshl_b32 s2, s2, 8 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s2, s55, 8 +; GFX11-NEXT: s_lshl_b32 s18, s51, 8 ; GFX11-NEXT: s_or_b32 s2, s3, s2 -; GFX11-NEXT: s_lshl_b32 s3, s62, 8 +; GFX11-NEXT: s_lshl_b32 s3, s74, 8 ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: s_or_b32 s3, s16, s3 ; GFX11-NEXT: s_and_b32 s16, s23, 0xff ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 ; GFX11-NEXT: s_or_b32 s16, s16, s17 -; GFX11-NEXT: v_readlane_b32 s17, v19, 20 +; GFX11-NEXT: s_and_b32 s17, s52, 0xff ; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_and_b32 s3, s16, 0xffff -; GFX11-NEXT: s_lshl_b32 s1, s1, 8 -; GFX11-NEXT: s_and_b32 s17, s17, 0xff -; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_or_b32 s17, s17, s18 -; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_and_b32 s3, s16, 0xffff ; GFX11-NEXT: s_lshl_b32 s16, s17, 16 -; GFX11-NEXT: s_lshl_b32 s17, s97, 8 +; GFX11-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 ; GFX11-NEXT: s_or_b32 s3, s3, s16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 -; GFX11-NEXT: v_readlane_b32 s2, v19, 17 +; GFX11-NEXT: s_and_b32 s0, s24, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s50, 8 +; GFX11-NEXT: s_and_b32 s2, s49, 0xff ; GFX11-NEXT: s_lshl_b32 s3, s88, 8 -; GFX11-NEXT: s_and_b32 s16, s69, 0xff -; GFX11-NEXT: 
s_and_b32 s18, s72, 0xff -; GFX11-NEXT: v_readlane_b32 s97, v17, 1 -; GFX11-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-NEXT: v_readlane_b32 s69, v16, 21 +; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_or_b32 s1, s2, s3 -; GFX11-NEXT: v_readlane_b32 s3, v19, 16 ; GFX11-NEXT: s_and_b32 s2, s25, 0xff -; GFX11-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_lshl_b32 s3, s48, 8 +; GFX11-NEXT: s_and_b32 s16, s39, 0xff +; GFX11-NEXT: s_lshl_b32 s17, s38, 8 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s16, s17 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: s_and_b32 s16, s73, 0xff +; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_or_b32 s1, s2, s3 ; GFX11-NEXT: s_and_b32 s2, s26, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s96, 8 +; GFX11-NEXT: s_lshl_b32 s3, s37, 8 +; GFX11-NEXT: s_and_b32 s16, s36, 0xff ; GFX11-NEXT: s_lshl_b32 s17, s76, 8 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s16, s17 ; GFX11-NEXT: s_and_b32 s16, s27, 0xff -; GFX11-NEXT: s_lshl_b32 s17, s87, 8 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s17, s35, 8 +; GFX11-NEXT: s_and_b32 s18, s34, 0xff +; GFX11-NEXT: s_lshl_b32 s19, vcc_hi, 8 ; GFX11-NEXT: s_or_b32 s16, s16, s17 ; GFX11-NEXT: s_or_b32 s17, s18, s19 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 ; GFX11-NEXT: s_and_b32 s16, s16, 0xffff ; GFX11-NEXT: s_lshl_b32 s17, s17, 16 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s16, s17 -; GFX11-NEXT: v_readlane_b32 s16, v19, 0 +; GFX11-NEXT: v_readlane_b32 s16, v18, 8 +; GFX11-NEXT: v_readlane_b32 s17, v18, 7 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off 
offset:16 ; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 ; GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 ; GFX11-NEXT: s_and_b32 s0, s28, 0xff -; GFX11-NEXT: s_lshl_b32 s1, s85, 8 -; GFX11-NEXT: s_and_b32 s2, s84, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s16, 8 -; GFX11-NEXT: v_readlane_b32 s17, v19, 1 +; GFX11-NEXT: s_lshl_b32 s1, s104, 8 +; GFX11-NEXT: s_and_b32 s2, s103, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s72, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_or_b32 s1, s2, s3 ; GFX11-NEXT: s_and_b32 s2, s29, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s83, 8 -; GFX11-NEXT: s_and_b32 s16, s82, 0xff -; GFX11-NEXT: s_lshl_b32 s17, s81, 8 -; GFX11-NEXT: v_readlane_b32 s18, v19, 2 +; GFX11-NEXT: s_lshl_b32 s3, s102, 8 +; GFX11-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s17, s17, 8 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s16, s17 ; GFX11-NEXT: s_and_b32 s0, s0, 0xffff @@ -11181,147 +11002,176 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_or_b32 s1, s2, s3 +; GFX11-NEXT: v_readlane_b32 s3, v18, 6 +; GFX11-NEXT: v_readlane_b32 s16, v18, 5 ; GFX11-NEXT: s_and_b32 s2, s40, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s61, 8 -; GFX11-NEXT: s_and_b32 s16, s80, 0xff -; GFX11-NEXT: s_lshl_b32 s17, s18, 8 -; GFX11-NEXT: v_readlane_b32 s19, v19, 3 +; GFX11-NEXT: s_lshl_b32 s17, s62, 8 +; GFX11-NEXT: v_readlane_b32 s18, v18, 3 +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_and_b32 s16, s16, 0xff ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s16, s17 +; GFX11-NEXT: v_readlane_b32 s17, v18, 4 +; GFX11-NEXT: v_readlane_b32 s19, v18, 2 ; GFX11-NEXT: s_and_b32 s16, s41, 0xff -; GFX11-NEXT: s_lshl_b32 s17, s60, 8 -; GFX11-NEXT: s_and_b32 s18, s71, 0xff -; GFX11-NEXT: s_lshl_b32 s19, s70, 8 -; GFX11-NEXT: s_or_b32 s16, s16, s17 -; GFX11-NEXT: s_or_b32 s17, s18, s19 +; GFX11-NEXT: s_and_b32 s18, s18, 0xff 
; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s17, s17, 8 +; GFX11-NEXT: s_lshl_b32 s19, s19, 8 ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s16, s16, s17 +; GFX11-NEXT: s_or_b32 s17, s18, s19 ; GFX11-NEXT: s_and_b32 s16, s16, 0xffff ; GFX11-NEXT: s_lshl_b32 s17, s17, 16 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s16, s17 -; GFX11-NEXT: v_readlane_b32 s16, v19, 4 ; GFX11-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 ; GFX11-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; GFX11-NEXT: v_readlane_b32 s1, v18, 1 +; GFX11-NEXT: v_readlane_b32 s2, v18, 0 ; GFX11-NEXT: s_and_b32 s0, s14, 0xff -; GFX11-NEXT: s_lshl_b32 s1, s58, 8 -; GFX11-NEXT: s_and_b32 s2, s59, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s16, 8 +; GFX11-NEXT: s_lshl_b32 s3, s60, 8 +; GFX11-NEXT: v_readlane_b32 s14, v19, 30 +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_or_b32 s1, s2, s3 ; GFX11-NEXT: s_and_b32 s2, s15, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s68, 8 -; GFX11-NEXT: s_and_b32 s14, s67, 0xff -; GFX11-NEXT: s_lshl_b32 s15, s66, 8 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s14, s15 -; GFX11-NEXT: v_readlane_b32 s14, v19, 6 +; GFX11-NEXT: v_readlane_b32 s3, v19, 31 +; GFX11-NEXT: v_readlane_b32 s15, v19, 29 +; GFX11-NEXT: s_and_b32 s14, s14, 0xff ; GFX11-NEXT: s_and_b32 s0, s0, 0xffff ; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_lshl_b32 s15, s15, 8 +; GFX11-NEXT: s_or_b32 s2, s2, s3 +; GFX11-NEXT: s_or_b32 s3, s14, s15 ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_or_b32 s1, s2, s3 ; GFX11-NEXT: s_and_b32 s2, s12, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s65, 8 -; GFX11-NEXT: s_and_b32 s12, s64, 0xff -; GFX11-NEXT: s_lshl_b32 s14, s14, 8 -; GFX11-NEXT: v_readlane_b32 s15, v19, 7 +; GFX11-NEXT: v_readlane_b32 s3, v19, 28 +; 
GFX11-NEXT: v_readlane_b32 s12, v19, 27 +; GFX11-NEXT: s_lshl_b32 s14, s58, 8 +; GFX11-NEXT: v_readlane_b32 s15, v19, 24 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:48 +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_and_b32 s12, s12, 0xff ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s12, s14 ; GFX11-NEXT: s_and_b32 s12, s13, 0xff -; GFX11-NEXT: s_lshl_b32 s13, s55, 8 -; GFX11-NEXT: s_and_b32 s14, s54, 0xff -; GFX11-NEXT: s_lshl_b32 s15, s53, 8 -; GFX11-NEXT: s_or_b32 s12, s12, s13 -; GFX11-NEXT: s_or_b32 s13, s14, s15 +; GFX11-NEXT: v_readlane_b32 s13, v19, 26 +; GFX11-NEXT: v_readlane_b32 s14, v19, 25 +; GFX11-NEXT: s_lshl_b32 s15, s15, 8 ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_lshl_b32 s13, s13, 8 +; GFX11-NEXT: s_and_b32 s14, s14, 0xff +; GFX11-NEXT: s_or_b32 s12, s12, s13 +; GFX11-NEXT: s_or_b32 s13, s14, s15 ; GFX11-NEXT: s_and_b32 s12, s12, 0xffff ; GFX11-NEXT: s_lshl_b32 s13, s13, 16 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s12, s13 -; GFX11-NEXT: v_readlane_b32 s12, v19, 8 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:48 ; GFX11-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 ; GFX11-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 +; GFX11-NEXT: v_readlane_b32 s1, v19, 23 +; GFX11-NEXT: v_readlane_b32 s2, v19, 22 ; GFX11-NEXT: s_and_b32 s0, s10, 0xff -; GFX11-NEXT: s_lshl_b32 s1, s52, 8 -; GFX11-NEXT: s_and_b32 s2, s51, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s12, 8 +; GFX11-NEXT: s_lshl_b32 s3, s56, 8 +; GFX11-NEXT: v_readlane_b32 s10, v19, 20 +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_or_b32 s1, s2, s3 ; GFX11-NEXT: s_and_b32 s2, s11, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s50, 8 -; 
GFX11-NEXT: s_and_b32 s10, s49, 0xff -; GFX11-NEXT: s_lshl_b32 s11, s48, 8 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s10, s11 -; GFX11-NEXT: v_readlane_b32 s10, v19, 10 +; GFX11-NEXT: v_readlane_b32 s3, v19, 21 +; GFX11-NEXT: v_readlane_b32 s11, v19, 19 +; GFX11-NEXT: s_and_b32 s10, s10, 0xff ; GFX11-NEXT: s_and_b32 s0, s0, 0xffff ; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_lshl_b32 s11, s11, 8 +; GFX11-NEXT: s_or_b32 s2, s2, s3 +; GFX11-NEXT: s_or_b32 s3, s10, s11 ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_or_b32 s1, s2, s3 ; GFX11-NEXT: s_and_b32 s2, s8, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s39, 8 -; GFX11-NEXT: s_and_b32 s8, s38, 0xff -; GFX11-NEXT: s_lshl_b32 s10, s10, 8 -; GFX11-NEXT: v_readlane_b32 s11, v19, 11 +; GFX11-NEXT: v_readlane_b32 s3, v19, 18 +; GFX11-NEXT: v_readlane_b32 s8, v19, 17 +; GFX11-NEXT: s_lshl_b32 s10, s46, 8 +; GFX11-NEXT: v_readlane_b32 s11, v19, 14 +; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_and_b32 s8, s8, 0xff ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s8, s10 ; GFX11-NEXT: s_and_b32 s8, s9, 0xff -; GFX11-NEXT: s_lshl_b32 s9, s37, 8 -; GFX11-NEXT: s_and_b32 s10, s36, 0xff -; GFX11-NEXT: s_lshl_b32 s11, s35, 8 -; GFX11-NEXT: s_or_b32 s8, s8, s9 -; GFX11-NEXT: s_or_b32 s9, s10, s11 +; GFX11-NEXT: v_readlane_b32 s9, v19, 16 +; GFX11-NEXT: v_readlane_b32 s10, v19, 15 +; GFX11-NEXT: s_lshl_b32 s11, s11, 8 ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_lshl_b32 s9, s9, 8 +; GFX11-NEXT: s_and_b32 s10, s10, 0xff +; GFX11-NEXT: s_or_b32 s8, s8, s9 +; GFX11-NEXT: s_or_b32 s9, s10, s11 ; GFX11-NEXT: s_and_b32 s8, s8, 0xffff ; GFX11-NEXT: s_lshl_b32 s9, s9, 16 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s8, s9 -; GFX11-NEXT: v_readlane_b32 s8, v19, 12 
-; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-NEXT: v_readlane_b32 s1, v19, 13 +; GFX11-NEXT: v_readlane_b32 s2, v19, 12 +; GFX11-NEXT: v_readlane_b32 s8, v19, 0 ; GFX11-NEXT: s_and_b32 s0, s6, 0xff -; GFX11-NEXT: s_lshl_b32 s1, s56, 8 -; GFX11-NEXT: s_and_b32 s2, s57, 0xff +; GFX11-NEXT: v_readlane_b32 s6, v19, 10 +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff ; GFX11-NEXT: s_lshl_b32 s3, s8, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_or_b32 s1, s2, s3 ; GFX11-NEXT: s_and_b32 s2, s7, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s34, 8 -; GFX11-NEXT: s_and_b32 s6, vcc_hi, 0xff -; GFX11-NEXT: s_lshl_b32 s7, s46, 8 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s6, s7 -; GFX11-NEXT: v_readlane_b32 s6, v19, 14 +; GFX11-NEXT: v_readlane_b32 s3, v19, 11 +; GFX11-NEXT: v_readlane_b32 s7, v19, 9 +; GFX11-NEXT: s_and_b32 s6, s6, 0xff ; GFX11-NEXT: s_and_b32 s0, s0, 0xffff ; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-NEXT: s_or_b32 s2, s2, s3 +; GFX11-NEXT: s_or_b32 s3, s6, s7 ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_or_b32 s1, s2, s3 ; GFX11-NEXT: s_and_b32 s2, s4, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s47, 8 -; GFX11-NEXT: s_and_b32 s4, s104, 0xff +; GFX11-NEXT: v_readlane_b32 s3, v19, 8 +; GFX11-NEXT: v_readlane_b32 s4, v19, 7 +; GFX11-NEXT: v_readlane_b32 s6, v19, 2 +; GFX11-NEXT: v_readlane_b32 s7, v19, 3 +; GFX11-NEXT: v_readlane_b32 s7, v19, 4 +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_and_b32 s4, s4, 0xff ; GFX11-NEXT: s_lshl_b32 s6, s6, 8 -; GFX11-NEXT: v_readlane_b32 s7, v19, 15 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s4, s6 ; GFX11-NEXT: s_and_b32 s4, s5, 0xff -; GFX11-NEXT: s_lshl_b32 s5, s103, 8 -; 
GFX11-NEXT: s_and_b32 s6, s102, 0xff -; GFX11-NEXT: s_lshl_b32 s7, s101, 8 -; GFX11-NEXT: s_or_b32 s4, s4, s5 -; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: v_readlane_b32 s5, v19, 6 +; GFX11-NEXT: v_readlane_b32 s6, v19, 5 +; GFX11-NEXT: s_lshl_b32 s7, s7, 8 ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s6, s7 ; GFX11-NEXT: s_and_b32 s4, s4, 0xffff ; GFX11-NEXT: s_lshl_b32 s5, s5, 16 ; GFX11-NEXT: s_or_b32 s2, s2, s3 @@ -11329,9 +11179,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:64 ; GFX11-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 ; GFX11-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 -; GFX11-NEXT: v_readlane_b32 s17, v19, 5 -; GFX11-NEXT: v_readlane_b32 s13, v19, 9 -; GFX11-NEXT: v_readlane_b32 s9, v19, 13 +; GFX11-NEXT: v_readlane_b32 s9, v19, 1 ; GFX11-NEXT: s_clause 0x2 ; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:80 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:96 @@ -11340,8 +11188,13 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: v_readlane_b32 s103, v17, 7 ; GFX11-NEXT: v_readlane_b32 s102, v17, 6 ; GFX11-NEXT: v_readlane_b32 s101, v17, 5 +; GFX11-NEXT: v_readlane_b32 s100, v17, 4 +; GFX11-NEXT: v_readlane_b32 s99, v17, 3 +; GFX11-NEXT: v_readlane_b32 s98, v17, 2 +; GFX11-NEXT: v_readlane_b32 s97, v17, 1 ; GFX11-NEXT: v_readlane_b32 s96, v17, 0 ; GFX11-NEXT: v_readlane_b32 s87, v16, 31 +; GFX11-NEXT: v_readlane_b32 s86, v16, 30 ; GFX11-NEXT: v_readlane_b32 s85, v16, 29 ; GFX11-NEXT: v_readlane_b32 s84, v16, 28 ; GFX11-NEXT: v_readlane_b32 s83, v16, 27 @@ -11350,6 +11203,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: v_readlane_b32 s80, v16, 24 ; 
GFX11-NEXT: v_readlane_b32 s71, v16, 23 ; GFX11-NEXT: v_readlane_b32 s70, v16, 22 +; GFX11-NEXT: v_readlane_b32 s69, v16, 21 ; GFX11-NEXT: v_readlane_b32 s68, v16, 20 ; GFX11-NEXT: v_readlane_b32 s67, v16, 19 ; GFX11-NEXT: v_readlane_b32 s66, v16, 18 @@ -11369,6 +11223,8 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: v_readlane_b32 s36, v16, 4 ; GFX11-NEXT: v_readlane_b32 s35, v16, 3 ; GFX11-NEXT: v_readlane_b32 s34, v16, 2 +; GFX11-NEXT: v_readlane_b32 s31, v16, 1 +; GFX11-NEXT: v_readlane_b32 s30, v16, 0 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_load_b32 v16, off, s32 @@ -11378,6 +11234,145 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB13_4: +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $vcc_hi +; GFX11-NEXT: ; implicit-def: $vcc_lo +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 0 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr101 +; GFX11-NEXT: ; implicit-def: $sgpr45 +; GFX11-NEXT: ; implicit-def: $sgpr30 +; GFX11-NEXT: ; implicit-def: $sgpr100 +; GFX11-NEXT: ; implicit-def: $sgpr99 +; GFX11-NEXT: ; implicit-def: $sgpr98 +; GFX11-NEXT: ; implicit-def: $sgpr43 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr94 +; GFX11-NEXT: ; implicit-def: $sgpr97 +; GFX11-NEXT: ; implicit-def: $sgpr96 +; GFX11-NEXT: ; implicit-def: $sgpr87 +; GFX11-NEXT: ; implicit-def: $sgpr86 +; GFX11-NEXT: ; implicit-def: $sgpr85 +; GFX11-NEXT: ; implicit-def: $sgpr92 +; GFX11-NEXT: ; implicit-def: $sgpr84 +; GFX11-NEXT: ; implicit-def: $sgpr83 +; GFX11-NEXT: ; implicit-def: $sgpr82 +; GFX11-NEXT: ; 
implicit-def: $sgpr81 +; GFX11-NEXT: ; implicit-def: $sgpr80 +; GFX11-NEXT: ; implicit-def: $sgpr90 +; GFX11-NEXT: ; implicit-def: $sgpr71 +; GFX11-NEXT: ; implicit-def: $sgpr70 +; GFX11-NEXT: ; implicit-def: $sgpr69 +; GFX11-NEXT: ; implicit-def: $sgpr68 +; GFX11-NEXT: ; implicit-def: $sgpr67 +; GFX11-NEXT: ; implicit-def: $sgpr78 +; GFX11-NEXT: ; implicit-def: $sgpr66 +; GFX11-NEXT: ; implicit-def: $sgpr65 +; GFX11-NEXT: ; implicit-def: $sgpr64 +; GFX11-NEXT: ; implicit-def: $sgpr55 +; GFX11-NEXT: ; implicit-def: $sgpr54 +; GFX11-NEXT: ; implicit-def: $sgpr74 +; GFX11-NEXT: ; implicit-def: $sgpr53 +; GFX11-NEXT: ; implicit-def: $sgpr52 +; GFX11-NEXT: ; implicit-def: $sgpr51 +; GFX11-NEXT: ; implicit-def: $sgpr50 +; GFX11-NEXT: ; implicit-def: $sgpr49 +; GFX11-NEXT: ; implicit-def: $sgpr48 +; GFX11-NEXT: ; implicit-def: $sgpr39 +; GFX11-NEXT: ; implicit-def: $sgpr38 +; GFX11-NEXT: ; implicit-def: $sgpr37 +; GFX11-NEXT: ; implicit-def: $sgpr36 +; GFX11-NEXT: ; implicit-def: $sgpr35 +; GFX11-NEXT: ; implicit-def: $sgpr34 +; GFX11-NEXT: ; implicit-def: $sgpr104 +; GFX11-NEXT: ; implicit-def: $sgpr103 +; GFX11-NEXT: ; implicit-def: $sgpr102 +; GFX11-NEXT: ; implicit-def: $sgpr88 +; GFX11-NEXT: ; implicit-def: $sgpr76 +; GFX11-NEXT: ; implicit-def: $sgpr72 +; GFX11-NEXT: ; implicit-def: $sgpr62 +; GFX11-NEXT: ; implicit-def: $sgpr60 +; GFX11-NEXT: ; implicit-def: $sgpr58 +; GFX11-NEXT: ; implicit-def: $sgpr56 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: v_writelane_b32 v19, vcc_hi, 1 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $vcc_lo +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; 
implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 2 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: v_writelane_b32 v19, vcc_hi, 3 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 
+; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: s_branch .LBB13_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -18701,7 +18696,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 ; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 ; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:8 @@ -18727,7 +18722,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 ; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v15 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 @@ -18736,11 +18731,11 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; 
VI-NEXT: v_lshlrev_b32_e32 v8, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v5 -; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v9 -; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v12, 8, v11 ; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v13 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 @@ -18761,7 +18756,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v41 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v40 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v55 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v54 @@ -18769,22 +18764,22 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v50 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 
4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v22 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 @@ -18807,23 +18802,24 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v38 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v0 +; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_lshlrev_b32_e32 v54, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v13 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v49, 8, v3 -; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 
s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v9 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 @@ -18836,37 +18832,37 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 ; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v13 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v3 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:328 ; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 -; VI-NEXT: 
buffer_load_ushort v12, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:28 ; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 ; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v9 ; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:68 ; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:76 ; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84 @@ -18874,23 +18870,23 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:100 ; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:108 ; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:116 -; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:124 ; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:140 ; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:156 +; VI-NEXT: 
buffer_load_ushort v51, off, s[0:3], s32 offset:156 ; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:164 ; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:172 -; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:180 -; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:188 ; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:196 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:204 -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:212 ; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:220 ; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 ; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:252 ; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:260 ; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:268 ; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:276 @@ -18903,57 +18899,57 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; 
VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:636 ; 4-byte Folded 
Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; VI-NEXT: 
buffer_store_dword v25, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword 
v38, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB15_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_or_b32_sdwa v0, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v4, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v2, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload 
; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -18968,12 +18964,12 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v2, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v3, v3, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -19002,9 +18998,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v29, v9 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 
; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload @@ -19027,15 +19021,15 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v50, v0 -; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -19045,18 +19039,18 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v59, v0 ; VI-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: 
v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v56, v0 -; VI-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v39, v0 ; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -19064,7 +19058,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v38, v1 ; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte 
Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v37, v0 @@ -19072,8 +19066,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v36, v0 ; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -19085,39 +19079,41 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v33, v0 -; VI-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) 
-; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v1, v21, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v51, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v34, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v22, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v25, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v34, v22 ; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v43, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v30, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v43, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v32, v23 ; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v43, v49 -; VI-NEXT: v_or_b32_sdwa v0, v30, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v32, v54 -; VI-NEXT: v_mov_b32_e32 v34, v26 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v51, v3 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v43, v0 +; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v49, v1 -; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v54, v0 @@ -19127,28 +19123,26 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v46, v61 ; VI-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v47, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v45, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v58, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v58, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v45, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v44, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v47, v45 ; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v58, v44 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v48, v0 -; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v63, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v48, v28 +; VI-NEXT: v_mov_b32_e32 v47, v58 +; VI-NEXT: v_mov_b32_e32 v45, v44 +; VI-NEXT: v_mov_b32_e32 v63, v42 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v42, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte 
Folded Reload ; VI-NEXT: v_or_b32_sdwa v1, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v63, v42 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -19164,8 +19158,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v57, v0 ; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -19199,44 +19193,43 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_branch .LBB15_3 ; VI-NEXT: .LBB15_2: -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v39, off, s[0:3], 
s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v32, v54 -; VI-NEXT: v_mov_b32_e32 v43, v49 +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v46, v61 -; VI-NEXT: v_mov_b32_e32 v47, v45 -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v34, v26 -; VI-NEXT: v_mov_b32_e32 v58, v44 +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword 
v61, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v34, v22 +; VI-NEXT: v_mov_b32_e32 v32, v23 +; VI-NEXT: v_mov_b32_e32 v47, v58 +; VI-NEXT: v_mov_b32_e32 v45, v44 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_mov_b32_e32 v63, v42 ; VI-NEXT: v_mov_b32_e32 v51, v7 -; VI-NEXT: v_mov_b32_e32 v48, v29 +; VI-NEXT: v_mov_b32_e32 v48, v28 ; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: .LBB15_3: ; %Flow ; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; VI-NEXT: v_mov_b32_e32 v44, v47 -; VI-NEXT: v_mov_b32_e32 v47, v46 +; VI-NEXT: v_mov_b32_e32 v42, v45 +; VI-NEXT: v_mov_b32_e32 v45, v46 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_mov_b32_e32 v46, v49 ; VI-NEXT: s_cbranch_vccnz .LBB15_5 ; VI-NEXT: ; %bb.4: ; %cmp.true -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 @@ -19290,7 +19283,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, 
s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 @@ -19299,8 +19292,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -19313,8 +19306,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -19389,29 +19382,29 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 
offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; 
VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 @@ -19423,8 +19416,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 @@ -19436,8 +19429,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 @@ -19449,8 +19442,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, 
s[0:3], s32 offset:756 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 @@ -19461,8 +19454,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -19472,8 +19465,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; 
VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 @@ -19484,8 +19477,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -19495,63 +19488,63 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, 
vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 
offset:668 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -19561,54 +19554,57 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v47 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v45 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v44 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v47 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; 
VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v58 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v42 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v63 -; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v41 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v40 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload @@ -19893,7 +19889,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:164 ; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:172 ; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:180 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:188 ; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:196 ; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:204 ; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:212 @@ -19901,11 +19897,11 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:228 ; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:236 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:252 +; GFX9-NEXT: buffer_load_ushort v44, 
off, s[0:3], s32 offset:252 ; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:260 ; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:268 -; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:276 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:276 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:284 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:292 ; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:300 ; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:308 @@ -19931,7 +19927,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill @@ -20106,7 +20102,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v44, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v40, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: 
v_or_b32_sdwa v1, v37, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -20118,7 +20114,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v37, v57 ; GFX9-NEXT: v_mov_b32_e32 v57, v60 ; GFX9-NEXT: v_mov_b32_e32 v52, v56 -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_mov_b32_e32 v34, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -20127,14 +20123,14 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v45, v44 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v45, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -20144,12 +20140,12 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v51, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -20203,7 +20199,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: .LBB15_2: ; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:612 ; 4-byte Folded 
Reload ; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload @@ -20565,12 +20561,12 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v45 -; GFX9-NEXT: v_or_b32_sdwa v1, v44, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v40 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v44 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v55 @@ -20580,7 +20576,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_add_u32_e32 v0, 3, v43 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v36 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v42 ; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -20589,7 +20585,7 @@ define 
inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v42 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v36 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v41 @@ -23677,8 +23673,23 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a ; SI-NEXT: s_mov_b32 s72, s74 ; SI-NEXT: s_mov_b32 s73, s75 ; SI-NEXT: s_mov_b32 s74, s76 -; SI-NEXT: v_readlane_b32 s75, v21, 0 -; SI-NEXT: v_readlane_b32 s76, v21, 1 +; SI-NEXT: s_mov_b32 s75, s77 +; SI-NEXT: s_mov_b32 s76, s78 +; SI-NEXT: s_mov_b32 s77, s79 +; SI-NEXT: s_mov_b32 s78, s88 +; SI-NEXT: s_mov_b32 s79, s89 +; SI-NEXT: s_mov_b32 s88, s90 +; SI-NEXT: s_mov_b32 s89, s91 +; SI-NEXT: s_mov_b32 s90, s92 +; SI-NEXT: s_mov_b32 s91, s93 +; SI-NEXT: s_mov_b32 s92, s94 +; SI-NEXT: s_mov_b32 s93, s95 +; SI-NEXT: s_mov_b32 s94, s30 +; SI-NEXT: s_mov_b32 s95, s31 +; SI-NEXT: s_mov_b32 s30, s34 +; SI-NEXT: s_mov_b32 s31, s35 +; SI-NEXT: v_readlane_b32 s34, v21, 0 +; SI-NEXT: v_readlane_b32 s35, v21, 1 ; SI-NEXT: s_cbranch_vccnz .LBB17_5 ; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 @@ -23740,22 +23751,22 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a ; SI-NEXT: s_lshl_b32 s62, s84, 16 ; SI-NEXT: s_and_b32 s73, s83, 0xffff0000 ; SI-NEXT: s_lshl_b32 s72, s83, 16 -; SI-NEXT: s_and_b32 s77, s82, 0xffff0000 +; SI-NEXT: s_and_b32 s75, s82, 0xffff0000 ; SI-NEXT: s_lshl_b32 s74, s82, 16 -; SI-NEXT: s_and_b32 s79, s81, 0xffff0000 -; SI-NEXT: s_lshl_b32 s78, s81, 16 -; SI-NEXT: s_and_b32 s89, s80, 0xffff0000 -; SI-NEXT: 
s_lshl_b32 s88, s80, 16 -; SI-NEXT: s_and_b32 s91, s71, 0xffff0000 -; SI-NEXT: s_lshl_b32 s90, s71, 16 -; SI-NEXT: s_and_b32 s93, s70, 0xffff0000 -; SI-NEXT: s_lshl_b32 s92, s70, 16 -; SI-NEXT: s_and_b32 s95, s29, 0xffff0000 -; SI-NEXT: s_lshl_b32 s94, s29, 16 -; SI-NEXT: s_and_b32 s31, s28, 0xffff0000 -; SI-NEXT: s_lshl_b32 s30, s28, 16 -; SI-NEXT: s_and_b32 s35, s27, 0xffff0000 -; SI-NEXT: s_lshl_b32 s34, s27, 16 +; SI-NEXT: s_and_b32 s77, s81, 0xffff0000 +; SI-NEXT: s_lshl_b32 s76, s81, 16 +; SI-NEXT: s_and_b32 s79, s80, 0xffff0000 +; SI-NEXT: s_lshl_b32 s78, s80, 16 +; SI-NEXT: s_and_b32 s89, s71, 0xffff0000 +; SI-NEXT: s_lshl_b32 s88, s71, 16 +; SI-NEXT: s_and_b32 s91, s70, 0xffff0000 +; SI-NEXT: s_lshl_b32 s90, s70, 16 +; SI-NEXT: s_and_b32 s93, s29, 0xffff0000 +; SI-NEXT: s_lshl_b32 s92, s29, 16 +; SI-NEXT: s_and_b32 s95, s28, 0xffff0000 +; SI-NEXT: s_lshl_b32 s94, s28, 16 +; SI-NEXT: s_and_b32 s31, s27, 0xffff0000 +; SI-NEXT: s_lshl_b32 s30, s27, 16 ; SI-NEXT: s_and_b32 s37, s26, 0xffff0000 ; SI-NEXT: s_lshl_b32 s36, s26, 16 ; SI-NEXT: s_and_b32 s39, s25, 0xffff0000 @@ -23774,8 +23785,8 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a ; SI-NEXT: s_lshl_b32 s66, s19, 16 ; SI-NEXT: s_and_b32 s69, s18, 0xffff0000 ; SI-NEXT: s_lshl_b32 s68, s18, 16 -; SI-NEXT: s_and_b32 s76, s17, 0xffff0000 -; SI-NEXT: s_lshl_b32 s75, s17, 16 +; SI-NEXT: s_and_b32 s35, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s34, s17, 16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v21, s6, 2 ; SI-NEXT: s_lshl_b32 s6, s16, 16 @@ -23789,9 +23800,9 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s76 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s75 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s34 ; 
SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -23859,57 +23870,57 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s34 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s31 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s30 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s94 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s93 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s92 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s91 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s90 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s89 ; 
SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s88 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s79 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s78 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s76 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s75 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s74 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 @@ -27349,10 +27360,10 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_mul_f32_e32 v34, 1.0, v36 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v33, 1.0, s18 ; SI-NEXT: v_mul_f32_e64 v36, 1.0, s21 ; SI-NEXT: v_mul_f32_e64 v42, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v33, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s22 ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill @@ -27381,9 +27392,9 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded 
Spill ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB19_4 @@ -27394,12 +27405,13 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: v_alignbit_b32 v5, v5, v8, 16 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v35, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v3, v3, v35, 16 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_alignbit_b32 v4, v4, v9, 16 ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_mov_b32_e32 v59, v2 ; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 @@ -27409,10 +27421,11 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_mov_b32_e32 v47, v10 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_mov_b32_e32 v45, v12 -; SI-NEXT: v_lshrrev_b32_e32 
v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v3, v3, v33, 16 -; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_mov_b32_e32 v44, v14 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_alignbit_b32 v1, v1, v33, 16 ; SI-NEXT: v_mov_b32_e32 v62, v38 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 @@ -27452,20 +27465,20 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_alignbit_b32 v28, v28, v37, 16 ; SI-NEXT: v_mov_b32_e32 v37, v34 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(2) ; SI-NEXT: v_mov_b32_e32 v35, v7 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_mov_b32_e32 v43, v8 ; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) -; SI-NEXT: v_mov_b32_e32 v42, v9 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mov_b32_e32 v60, v9 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v32 ; SI-NEXT: v_alignbit_b32 v31, v31, v34, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v60, v8 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_mov_b32_e32 v42, v8 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v11 @@ -27489,7 +27502,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_alignbit_b32 v12, v12, v14, 16 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v44, v14 +; SI-NEXT: v_mov_b32_e32 v33, v14 ; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: 
s_waitcnt vmcnt(0) @@ -27512,7 +27525,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; SI-NEXT: .LBB19_2: ; %cmp.true ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 @@ -27528,7 +27541,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v57 @@ -27540,7 +27553,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v45 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v36 @@ -27652,7 +27665,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; 
SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 @@ -27675,7 +27688,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v43 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v60 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v58 @@ -27690,7 +27703,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v63 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v62 @@ -27776,16 +27789,16 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB19_4: ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: 
buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v61, v53 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload @@ -27794,7 +27807,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_mov_b32_e32 v57, v11 ; SI-NEXT: v_mov_b32_e32 v47, v10 ; SI-NEXT: v_mov_b32_e32 v45, v12 -; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_mov_b32_e32 v44, v14 ; SI-NEXT: v_mov_b32_e32 v62, v38 ; SI-NEXT: v_mov_b32_e32 v38, v39 ; SI-NEXT: v_mov_b32_e32 v39, v41 @@ -29995,36 +30008,96 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed 
$vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr51 ; 
SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: ; implicit-def: $vgpr31 @@ -30034,173 +30107,99 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed 
$vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB20_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v31 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v46, 
v31 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v63 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v29 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v41, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v32 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v28 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 
offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v32 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 
; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v43, v3 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v32 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v58, v2 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v32 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v60, v1 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v32 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; 
SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v32, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v63 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v31 @@ -30215,16 +30214,6 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 @@ -30232,28 +30221,37 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v62 -; SI-NEXT: 
buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v29 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v42, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v1 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -30270,7 +30268,22 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; 
implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: .LBB20_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB20_4 @@ -30278,87 +30291,78 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v26 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v50 -; SI-NEXT: v_mov_b32_e32 v50, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v23 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v56 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; SI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v7 ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 ; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 ; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v18 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 ; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 
; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v31, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v26 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v55 ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v25 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v63 +; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v62 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v42 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:204 ; 4-byte 
Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v42, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v62 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 ; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v63 -; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v62 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 @@ -30368,39 +30372,43 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v27 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v27 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 ; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v42 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v44 ; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 ; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v28 ; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v42 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, 
s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v40, v5 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v41, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v4 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v3 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v2 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v33 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 @@ -30417,53 +30425,58 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v38, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 +; SI-NEXT: v_mov_b32_e32 v36, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 +; 
SI-NEXT: v_mov_b32_e32 v34, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v45 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v59, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v61, v2 -; SI-NEXT: v_mov_b32_e32 v52, v29 -; SI-NEXT: v_mov_b32_e32 v48, v30 -; SI-NEXT: v_mov_b32_e32 v56, v28 -; SI-NEXT: v_mov_b32_e32 v34, v7 -; SI-NEXT: v_mov_b32_e32 v32, v6 -; SI-NEXT: v_mov_b32_e32 v46, v8 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: 
buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v50, v30 +; SI-NEXT: v_mov_b32_e32 v55, v29 +; SI-NEXT: v_mov_b32_e32 v44, v8 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v43, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 ; SI-NEXT: .LBB20_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] @@ -30482,34 +30495,32 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 ; SI-NEXT: 
v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 @@ -30518,25 +30529,25 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 ; 
SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 @@ -30545,7 +30556,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 ; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 @@ -30554,7 +30565,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 ; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 @@ -30563,7 +30574,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 @@ -30572,7 +30583,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 ; 
SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 @@ -30581,7 +30592,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 @@ -30591,8 +30602,8 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -30602,8 +30613,8 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -30613,8 +30624,8 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: 
buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -30624,8 +30635,8 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -30635,8 +30646,8 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -30646,8 +30657,8 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: 
buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -30657,8 +30668,8 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -30668,8 +30679,8 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -30679,8 +30690,8 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: 
buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -30690,8 +30701,8 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -30701,8 +30712,8 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -30711,46 +30722,48 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 
v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: 
v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -35972,24 +35985,24 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:48 ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:68 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v11 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v11 ; 
SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v21 @@ -36010,23 +36023,23 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 ; SI-NEXT: s_waitcnt vmcnt(5) expcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v36 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v32 -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v32 +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, 
s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill @@ -36039,45 +36052,46 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB27_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v7, v0, v61 +; SI-NEXT: v_or_b32_e32 v7, v0, v58 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v9, v0, v50 +; SI-NEXT: v_or_b32_e32 v9, v0, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v10, v0, v43 +; SI-NEXT: v_or_b32_e32 v10, v0, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 -; SI-NEXT: v_or_b32_e32 v11, v0, v41 +; SI-NEXT: v_or_b32_e32 v11, v0, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 -; SI-NEXT: v_or_b32_e32 v12, v0, v40 +; SI-NEXT: v_or_b32_e32 v12, v0, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_mov_b32_e32 v52, v57 -; SI-NEXT: v_mov_b32_e32 v57, v40 -; 
SI-NEXT: v_mov_b32_e32 v40, v49 -; SI-NEXT: v_mov_b32_e32 v49, v13 +; SI-NEXT: v_mov_b32_e32 v36, v41 +; SI-NEXT: v_mov_b32_e32 v41, v13 ; SI-NEXT: v_or_b32_e32 v13, v0, v13 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 -; SI-NEXT: v_mov_b32_e32 v36, v41 -; SI-NEXT: v_mov_b32_e32 v41, v14 -; SI-NEXT: v_or_b32_e32 v14, v0, v48 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 ; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v50, v43 -; SI-NEXT: v_mov_b32_e32 v43, v48 -; SI-NEXT: v_mov_b32_e32 v48, v15 +; SI-NEXT: v_mov_b32_e32 v50, v45 +; SI-NEXT: v_mov_b32_e32 v45, v14 +; SI-NEXT: v_or_b32_e32 v14, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: v_mov_b32_e32 v52, v57 +; SI-NEXT: v_mov_b32_e32 v57, v49 +; SI-NEXT: v_mov_b32_e32 v49, v40 +; SI-NEXT: v_mov_b32_e32 v40, v15 ; SI-NEXT: v_or_b32_e32 v15, v0, v15 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 -; SI-NEXT: v_mov_b32_e32 v38, v61 +; SI-NEXT: v_mov_b32_e32 v34, v58 +; SI-NEXT: v_mov_b32_e32 v58, v61 ; SI-NEXT: v_mov_b32_e32 v61, v56 ; SI-NEXT: v_mov_b32_e32 v56, v16 ; SI-NEXT: v_or_b32_e32 v16, v0, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_or_b32_e32 v17, v0, v17 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 ; SI-NEXT: s_waitcnt expcnt(0) @@ -36111,7 +36125,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 ; SI-NEXT: v_or_b32_e32 v26, v0, v26 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 @@ -36122,7 +36136,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_lshl_b32 s9, s25, 16 ; SI-NEXT: v_mov_b32_e32 v33, v28 ; SI-NEXT: v_or_b32_e32 v28, v0, v5 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: 
s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 @@ -36134,7 +36148,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_lshl_b32 s11, s29, 16 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v30, v0, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 ; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: v_mov_b32_e32 v63, v2 ; SI-NEXT: v_mov_b32_e32 v32, v55 @@ -36142,9 +36156,9 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v55, v4 ; SI-NEXT: v_mov_b32_e32 v53, v6 ; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_mov_b32_e32 v59, v42 -; SI-NEXT: v_or_b32_e32 v31, v0, v34 +; SI-NEXT: v_mov_b32_e32 v59, v44 +; SI-NEXT: v_mov_b32_e32 v43, v42 +; SI-NEXT: v_or_b32_e32 v31, v0, v48 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -36154,12 +36168,13 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_cbranch_execnz .LBB27_3 ; SI-NEXT: .LBB27_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v32, v1 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v38, v43 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 @@ -36203,42 +36218,42 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 
+; SI-NEXT: v_or_b32_e32 v0, v34, v0 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: 
v_add_i32_e32 v16, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -36300,7 +36315,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -36316,12 +36331,12 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 @@ -36330,7 +36345,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 @@ -36365,26 +36380,26 @@ define inreg <32 x i32> 
@bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB27_4: -; SI-NEXT: v_mov_b32_e32 v38, v61 +; SI-NEXT: v_mov_b32_e32 v34, v58 ; SI-NEXT: v_mov_b32_e32 v32, v55 +; SI-NEXT: v_mov_b32_e32 v58, v61 ; SI-NEXT: v_mov_b32_e32 v63, v2 ; SI-NEXT: v_mov_b32_e32 v55, v4 ; SI-NEXT: v_mov_b32_e32 v53, v6 ; SI-NEXT: v_mov_b32_e32 v52, v57 ; SI-NEXT: v_mov_b32_e32 v51, v50 ; SI-NEXT: v_mov_b32_e32 v61, v56 -; SI-NEXT: v_mov_b32_e32 v50, v43 +; SI-NEXT: v_mov_b32_e32 v50, v45 ; SI-NEXT: v_mov_b32_e32 v36, v41 -; SI-NEXT: v_mov_b32_e32 v57, v40 -; SI-NEXT: v_mov_b32_e32 v40, v49 -; SI-NEXT: v_mov_b32_e32 v49, v13 -; SI-NEXT: v_mov_b32_e32 v43, v48 -; SI-NEXT: v_mov_b32_e32 v48, v15 -; SI-NEXT: v_mov_b32_e32 v41, v14 +; SI-NEXT: v_mov_b32_e32 v41, v13 +; SI-NEXT: v_mov_b32_e32 v57, v49 +; SI-NEXT: v_mov_b32_e32 v49, v40 +; SI-NEXT: v_mov_b32_e32 v40, v15 +; SI-NEXT: v_mov_b32_e32 v45, v14 ; SI-NEXT: v_mov_b32_e32 v56, v16 ; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_mov_b32_e32 v59, v42 +; SI-NEXT: v_mov_b32_e32 v59, v44 +; SI-NEXT: v_mov_b32_e32 v43, v42 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v37, v20 ; SI-NEXT: v_mov_b32_e32 v39, v23 @@ -55498,7 +55513,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 ; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 ; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:8 @@ -55524,7 
+55539,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 ; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v15 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 @@ -55533,11 +55548,11 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v5 -; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v9 -; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v12, 8, v11 ; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v13 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 @@ -55558,7 +55573,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v41 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v40 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v55 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; VI-NEXT: 
v_lshlrev_b32_e32 v0, 8, v54 @@ -55566,22 +55581,22 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v50 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v22 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 @@ -55604,23 +55619,24 @@ define 
inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v38 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v0 +; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_lshlrev_b32_e32 v54, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v13 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v49, 8, v3 -; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v9 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 @@ -55633,37 +55649,37 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 ; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:568 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v13 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v3 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:328 ; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:28 ; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 ; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v9 ; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v39, off, 
s[0:3], s32 offset:68 ; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:76 ; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84 @@ -55671,23 +55687,23 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:100 ; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:108 ; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:116 -; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:124 ; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:140 ; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:156 ; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:164 ; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:172 -; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:180 -; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:188 ; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:196 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:204 -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:212 ; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:220 ; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 ; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort 
v58, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:252 ; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:260 ; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:268 ; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:276 @@ -55700,57 +55716,57 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], 
s32 offset:664 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:688 ; 4-byte 
Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; 
VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: 
buffer_store_dword v50, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB39_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_or_b32_sdwa v0, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v4, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v2, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -55765,12 +55781,12 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v2, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v3, v3, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -55799,9 +55815,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v29, v9 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload @@ -55824,15 +55838,15 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 
; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v50, v0 -; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -55842,18 +55856,18 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_mov_b32_e32 v59, v0 ; VI-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v56, v0 -; VI-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v39, v0 ; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -55861,7 +55875,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_mov_b32_e32 v38, v1 ; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v37, v0 @@ -55869,8 +55883,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v36, v0 ; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD @@ -55882,39 +55896,41 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v33, v0 -; VI-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v1, v21, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v51, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v34, v31 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v22, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v25, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v34, v22 ; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v43, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v30, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v43, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v32, v23 ; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v43, v49 -; VI-NEXT: v_or_b32_sdwa v0, v30, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v32, v54 -; VI-NEXT: v_mov_b32_e32 v34, v26 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v51, v3 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v43, v0 +; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v49, v1 -; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v54, v0 @@ -55924,28 +55940,26 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_mov_b32_e32 v46, v61 ; VI-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v45, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v58, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v58, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v45, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v44, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v47, v45 ; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v58, v44 -; 
VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v48, v0 -; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v63, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v48, v28 +; VI-NEXT: v_mov_b32_e32 v47, v58 +; VI-NEXT: v_mov_b32_e32 v45, v44 +; VI-NEXT: v_mov_b32_e32 v63, v42 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v42, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v1, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v63, v42 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -55961,8 +55975,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; 
VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v57, v0 ; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -55996,44 +56010,43 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_branch .LBB39_3 ; VI-NEXT: .LBB39_2: -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v32, v54 -; VI-NEXT: v_mov_b32_e32 v43, v49 +; VI-NEXT: buffer_load_dword 
v43, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v46, v61 -; VI-NEXT: v_mov_b32_e32 v47, v45 -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v34, v26 -; VI-NEXT: v_mov_b32_e32 v58, v44 +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v34, v22 +; VI-NEXT: v_mov_b32_e32 v32, v23 +; VI-NEXT: v_mov_b32_e32 v47, v58 +; VI-NEXT: v_mov_b32_e32 v45, v44 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_mov_b32_e32 v63, v42 ; VI-NEXT: v_mov_b32_e32 v51, v7 -; VI-NEXT: v_mov_b32_e32 v48, v29 +; VI-NEXT: v_mov_b32_e32 v48, v28 ; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: .LBB39_3: ; %Flow ; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; VI-NEXT: v_mov_b32_e32 v44, v47 -; VI-NEXT: v_mov_b32_e32 v47, v46 +; VI-NEXT: v_mov_b32_e32 v42, v45 +; VI-NEXT: v_mov_b32_e32 v45, v46 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_mov_b32_e32 v46, v49 ; VI-NEXT: s_cbranch_vccnz .LBB39_5 ; VI-NEXT: ; %bb.4: ; %cmp.true -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; VI-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 @@ -56087,7 +56100,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 @@ -56096,8 +56109,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -56110,8 +56123,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -56186,29 +56199,29 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 
offset:784 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 @@ -56220,8 +56233,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 @@ -56233,8 +56246,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 @@ -56246,8 +56259,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 @@ -56258,8 +56271,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte 
Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -56269,8 +56282,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 @@ -56281,8 +56294,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -56292,63 +56305,63 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; VI-NEXT: 
v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, 
v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -56358,54 +56371,57 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v47 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v45 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: 
v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v44 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v47 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v58 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v42 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v63 -; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v41 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v40 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload @@ -56690,7 +56706,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: 
buffer_load_ushort v31, off, s[0:3], s32 offset:164 ; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:172 ; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:180 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:188 ; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:196 ; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:204 ; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:212 @@ -56698,11 +56714,11 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:228 ; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:236 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:252 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:252 ; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:260 ; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:268 -; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:276 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:276 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:284 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:292 ; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:300 ; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:308 @@ -56728,7 +56744,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GFX9-NEXT: 
buffer_store_dword v40, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill @@ -56903,7 +56919,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v44, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v40, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v37, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -56915,7 +56931,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v37, v57 ; GFX9-NEXT: v_mov_b32_e32 v57, v60 ; GFX9-NEXT: v_mov_b32_e32 v52, v56 -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_mov_b32_e32 v34, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -56924,14 +56940,14 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded 
Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v45, v44 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v45, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -56941,12 +56957,12 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v51, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa 
v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -57000,7 +57016,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: .LBB39_2: ; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload @@ -57362,12 +57378,12 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v45 -; GFX9-NEXT: v_or_b32_sdwa v1, v44, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded 
Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v40 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v44 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v55 @@ -57377,7 +57393,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_add_u32_e32 v0, 3, v43 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v36 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v42 ; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -57386,7 +57402,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v42 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v36 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v41 @@ -64192,10 +64208,10 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; SI-NEXT: v_mul_f32_e32 v34, 1.0, v36 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v33, 1.0, s18 ; SI-NEXT: v_mul_f32_e64 v36, 1.0, s21 ; SI-NEXT: v_mul_f32_e64 v42, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v33, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s22 ; SI-NEXT: buffer_store_dword v30, off, 
s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill @@ -64224,9 +64240,9 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB43_4 @@ -64237,12 +64253,13 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: v_alignbit_b32 v5, v5, v8, 16 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v35, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v3, v3, v35, 16 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_alignbit_b32 v4, v4, v9, 16 ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded 
Reload +; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_mov_b32_e32 v59, v2 ; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 @@ -64252,10 +64269,11 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; SI-NEXT: v_mov_b32_e32 v47, v10 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_mov_b32_e32 v45, v12 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v3, v3, v33, 16 -; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_mov_b32_e32 v44, v14 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_alignbit_b32 v1, v1, v33, 16 ; SI-NEXT: v_mov_b32_e32 v62, v38 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 @@ -64295,20 +64313,20 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; SI-NEXT: v_alignbit_b32 v28, v28, v37, 16 ; SI-NEXT: v_mov_b32_e32 v37, v34 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(2) ; SI-NEXT: v_mov_b32_e32 v35, v7 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_mov_b32_e32 v43, v8 ; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) -; SI-NEXT: v_mov_b32_e32 v42, v9 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mov_b32_e32 v60, v9 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v32 ; SI-NEXT: v_alignbit_b32 v31, v31, v34, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v60, v8 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_mov_b32_e32 v42, v8 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v11 @@ -64332,7 +64350,7 @@ define inreg <32 x float> 
@bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; SI-NEXT: v_alignbit_b32 v12, v12, v14, 16 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v44, v14 +; SI-NEXT: v_mov_b32_e32 v33, v14 ; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) @@ -64355,7 +64373,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; SI-NEXT: .LBB43_2: ; %cmp.true ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 @@ -64371,7 +64389,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v57 @@ -64383,7 +64401,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v45 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_and_b32_e32 v15, 
0xffff0000, v36 @@ -64495,7 +64513,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 @@ -64518,7 +64536,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v43 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v60 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v58 @@ -64533,7 +64551,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v63 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v62 @@ -64619,16 +64637,16 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB43_4: ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword 
v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v61, v53 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload @@ -64637,7 +64655,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; SI-NEXT: v_mov_b32_e32 v57, v11 ; SI-NEXT: v_mov_b32_e32 v47, v10 ; SI-NEXT: v_mov_b32_e32 v45, v12 -; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_mov_b32_e32 v44, v14 ; SI-NEXT: v_mov_b32_e32 v62, v38 ; SI-NEXT: v_mov_b32_e32 v38, v39 ; SI-NEXT: v_mov_b32_e32 v39, v41 @@ -66838,36 +66856,96 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: 
; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; 
implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: ; implicit-def: $vgpr31 @@ -66877,173 +66955,99 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 
-; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v31 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v31 +; SI-NEXT: 
v_cvt_f32_f16_e32 v36, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v63 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v29 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v41, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v32 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v28 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; SI-NEXT: buffer_store_dword v31, off, 
s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 
4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v32 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v43, v3 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v32 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v58, v2 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v32 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v60, v1 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v32 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; 
SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v32, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v63 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v31 @@ -67058,16 +67062,6 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 @@ -67075,28 +67069,37 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 
v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v62 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v29 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v42, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v1 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -67113,7 +67116,22 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: 
$vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: .LBB44_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_4 @@ -67121,89 +67139,80 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 ; SI-NEXT: v_add_f32_e32 v33, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 -; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v26 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v50 -; SI-NEXT: v_mov_b32_e32 v50, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 -; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte 
Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v23 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v56 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v7 ; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 ; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v19 ; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 ; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v18 -; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 ; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v17 ; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 ; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 -; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v26 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v55 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v25 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_add_f32_e32 v40, 1.0, v63 +; SI-NEXT: v_add_f32_e32 v42, 1.0, v62 +; SI-NEXT: 
v_lshrrev_b32_e32 v63, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v42 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v42, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v62 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 ; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_add_f32_e32 v42, 1.0, v63 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 -; SI-NEXT: v_add_f32_e32 v44, 1.0, v62 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 @@ -67211,39 +67220,40 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 ; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v27 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v27 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 ; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 ; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v42 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v44 ; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 ; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 ; SI-NEXT: 
v_lshrrev_b32_e32 v45, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v28 ; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v42 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v40, v5 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v4 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v3 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v2 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v33 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 @@ -67262,51 +67272,59 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v41, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 
offset:212 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v60, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 +; SI-NEXT: v_mov_b32_e32 v36, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 +; SI-NEXT: v_mov_b32_e32 v34, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v45 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v59, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v61, v2 -; SI-NEXT: v_mov_b32_e32 v52, v29 -; SI-NEXT: v_mov_b32_e32 v48, v30 -; SI-NEXT: v_mov_b32_e32 v56, v28 -; SI-NEXT: v_mov_b32_e32 v34, v7 -; SI-NEXT: v_mov_b32_e32 v32, v6 -; SI-NEXT: v_mov_b32_e32 v46, v8 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v38, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: 
buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v50, v30 +; SI-NEXT: v_mov_b32_e32 v55, v29 +; SI-NEXT: v_mov_b32_e32 v44, v8 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v43, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 ; SI-NEXT: .LBB44_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] @@ -67325,34 +67343,32 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: 
buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 @@ -67361,25 +67377,25 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte 
Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 ; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 @@ -67388,7 +67404,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 ; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 @@ -67397,7 +67413,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 ; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 @@ -67406,7 +67422,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 
offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 @@ -67415,7 +67431,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 @@ -67424,7 +67440,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 @@ -67434,8 +67450,8 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -67445,8 +67461,8 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; 
SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -67456,8 +67472,8 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -67467,8 +67483,8 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -67478,8 +67494,8 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; 
SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -67489,8 +67505,8 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -67500,8 +67516,8 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -67511,8 +67527,8 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; 
SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -67522,8 +67538,8 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -67533,8 +67549,8 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -67544,8 +67560,8 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: 
s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -67554,46 +67570,48 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; 
SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -67826,12 +67844,12 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: s_lshr_b32 s4, s12, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 ; SI-NEXT: s_lshr_b32 s4, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v61, s4 ; SI-NEXT: s_lshr_b32 s4, s14, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 ; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v63, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 ; SI-NEXT: s_lshr_b32 s4, s40, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 ; SI-NEXT: s_lshr_b32 s4, s41, 16 @@ -67887,8 +67905,8 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v61, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 +; SI-NEXT: 
v_cvt_f32_f16_e32 v59, s11 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f32_f16_e32 v20, s15 ; SI-NEXT: v_cvt_f32_f16_e32 v22, s40 @@ -67911,67 +67929,83 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 ; SI-NEXT: v_cvt_f32_f16_e32 v10, s19 ; SI-NEXT: v_cvt_f32_f16_e32 v14, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v63, s17 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 ; SI-NEXT: s_cbranch_execnz .LBB45_3 ; SI-NEXT: .LBB45_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v1, s18, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e64 v2, s19, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_add_f32_e64 v20, s15, 1.0 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e64 v18, s27, 1.0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v20 +; SI-NEXT: v_add_f32_e64 v49, s8, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e64 v15, s12, 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e64 v1, s18, 1.0 ; SI-NEXT: v_add_f32_e64 v17, s13, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_add_f32_e64 v10, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v16, s26, 1.0 ; SI-NEXT: 
buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v17 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e64 v2, s19, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_add_f32_e64 v23, s29, 1.0 +; SI-NEXT: v_add_f32_e64 v29, s45, 1.0 ; SI-NEXT: v_add_f32_e64 v22, s40, 1.0 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 ; SI-NEXT: v_add_f32_e64 v21, s28, 1.0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 ; SI-NEXT: v_add_f32_e64 v19, s14, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v21 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v19 ; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v21, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v56 ; SI-NEXT: v_add_f32_e64 v41, s6, 1.0 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v41 -; SI-NEXT: v_add_f32_e64 v6, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v10, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s22, 1.0 ; SI-NEXT: v_add_f32_e64 v14, s25, 1.0 -; SI-NEXT: v_add_f32_e64 v18, s27, 1.0 -; SI-NEXT: v_add_f32_e64 v23, s29, 1.0 ; SI-NEXT: v_add_f32_e64 v27, s46, 1.0 -; SI-NEXT: v_add_f32_e64 v26, s42, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v6 -; SI-NEXT: 
v_lshrrev_b32_e32 v46, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v23 -; SI-NEXT: v_add_f32_e64 v25, s47, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v17 +; SI-NEXT: v_add_f32_e64 v24, s41, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; SI-NEXT: v_add_f32_e64 v37, s10, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v37 ; SI-NEXT: v_add_f32_e64 v53, s7, 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v57 -; SI-NEXT: v_add_f32_e64 v49, s8, 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v57 ; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v53 ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -67979,17 +68013,16 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v53, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 ; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s21, 1.0 ; SI-NEXT: v_add_f32_e64 v28, s43, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v49 ; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v41 ; SI-NEXT: v_add_f32_e64 v45, s9, 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v18 ; SI-NEXT: 
v_cvt_f32_f16_e32 v41, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v2 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v1 ; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v28 ; SI-NEXT: v_add_f32_e64 v34, s11, 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v45 @@ -67997,35 +68030,24 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v45, v6 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v62 ; SI-NEXT: v_cvt_f32_f16_e32 v62, v6 -; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v8, s22, 1.0 ; SI-NEXT: v_add_f32_e64 v12, s24, 1.0 -; SI-NEXT: v_add_f32_e64 v16, s26, 1.0 -; SI-NEXT: v_add_f32_e64 v29, s45, 1.0 +; SI-NEXT: v_add_f32_e64 v25, s47, 1.0 ; SI-NEXT: v_add_f32_e64 v30, s44, 1.0 -; SI-NEXT: v_add_f32_e64 v24, s41, 1.0 -; SI-NEXT: v_add_f32_e64 v20, s15, 1.0 -; SI-NEXT: v_add_f32_e64 v37, s10, 1.0 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; SI-NEXT: v_add_f32_e64 v26, s42, 1.0 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v56, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v25 ; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v37 +; SI-NEXT: 
v_lshrrev_b32_e32 v36, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v17 ; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v57, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 @@ -68033,7 +68055,7 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 @@ -68046,19 +68068,19 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v2 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v57, v3 +; SI-NEXT: 
buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v58, v3 ; SI-NEXT: .LBB45_3: ; %end @@ -68070,7 +68092,7 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 @@ -68215,7 +68237,7 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v20 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -68232,7 +68254,7 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -68250,14 +68272,14 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: 
v_cvt_f16_f32_e32 v1, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -68314,7 +68336,7 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr58 @@ -68357,14 +68379,14 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr51 @@ -72768,24 +72790,24 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:48 ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 
offset:56 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:68 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v11 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v11 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v21 @@ -72806,23 +72828,23 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 ; SI-NEXT: s_waitcnt vmcnt(5) expcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v36 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v32 -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v32 +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v3, off, 
s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill @@ -72835,45 +72857,46 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: 
buffer_store_dword v37, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB51_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v7, v0, v61 +; SI-NEXT: v_or_b32_e32 v7, v0, v58 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v9, v0, v50 +; SI-NEXT: v_or_b32_e32 v9, v0, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v10, v0, v43 +; SI-NEXT: v_or_b32_e32 v10, v0, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 -; SI-NEXT: v_or_b32_e32 v11, v0, v41 +; SI-NEXT: v_or_b32_e32 v11, v0, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 -; SI-NEXT: v_or_b32_e32 v12, v0, v40 +; SI-NEXT: v_or_b32_e32 v12, v0, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_mov_b32_e32 v52, v57 -; SI-NEXT: v_mov_b32_e32 v57, v40 -; SI-NEXT: v_mov_b32_e32 v40, v49 -; SI-NEXT: v_mov_b32_e32 v49, v13 +; SI-NEXT: v_mov_b32_e32 v36, v41 +; SI-NEXT: v_mov_b32_e32 v41, v13 ; SI-NEXT: v_or_b32_e32 v13, v0, v13 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 -; SI-NEXT: v_mov_b32_e32 v36, v41 -; SI-NEXT: v_mov_b32_e32 v41, v14 -; SI-NEXT: v_or_b32_e32 v14, v0, v48 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 ; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v50, v43 -; SI-NEXT: v_mov_b32_e32 v43, v48 -; SI-NEXT: v_mov_b32_e32 v48, v15 +; SI-NEXT: v_mov_b32_e32 v50, v45 +; SI-NEXT: v_mov_b32_e32 v45, v14 +; SI-NEXT: v_or_b32_e32 v14, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: v_mov_b32_e32 v52, v57 +; SI-NEXT: v_mov_b32_e32 v57, v49 +; SI-NEXT: v_mov_b32_e32 v49, v40 +; SI-NEXT: v_mov_b32_e32 v40, v15 ; SI-NEXT: v_or_b32_e32 v15, v0, v15 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 -; SI-NEXT: v_mov_b32_e32 v38, v61 +; SI-NEXT: v_mov_b32_e32 v34, v58 +; SI-NEXT: v_mov_b32_e32 v58, v61 ; SI-NEXT: v_mov_b32_e32 v61, v56 ; 
SI-NEXT: v_mov_b32_e32 v56, v16 ; SI-NEXT: v_or_b32_e32 v16, v0, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_or_b32_e32 v17, v0, v17 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 ; SI-NEXT: s_waitcnt expcnt(0) @@ -72907,7 +72930,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 ; SI-NEXT: v_or_b32_e32 v26, v0, v26 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 @@ -72918,7 +72941,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; SI-NEXT: s_lshl_b32 s9, s25, 16 ; SI-NEXT: v_mov_b32_e32 v33, v28 ; SI-NEXT: v_or_b32_e32 v28, v0, v5 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 @@ -72930,7 +72953,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; SI-NEXT: s_lshl_b32 s11, s29, 16 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v30, v0, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 ; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: v_mov_b32_e32 v63, v2 ; SI-NEXT: v_mov_b32_e32 v32, v55 @@ -72938,9 +72961,9 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; SI-NEXT: v_mov_b32_e32 v55, v4 ; SI-NEXT: v_mov_b32_e32 v53, v6 ; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_mov_b32_e32 v59, v42 -; SI-NEXT: v_or_b32_e32 v31, v0, v34 +; SI-NEXT: v_mov_b32_e32 v59, v44 +; SI-NEXT: v_mov_b32_e32 v43, v42 +; SI-NEXT: v_or_b32_e32 v31, v0, v48 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -72950,12 +72973,13 @@ define inreg 
<32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_cbranch_execnz .LBB51_3 ; SI-NEXT: .LBB51_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v32, v1 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v38, v43 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 @@ -72999,42 +73023,42 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; 
SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -73096,7 +73120,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -73112,12 +73136,12 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 ; SI-NEXT: v_and_b32_e32 v0, 
0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 @@ -73126,7 +73150,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 @@ -73161,26 +73185,26 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB51_4: -; SI-NEXT: v_mov_b32_e32 v38, v61 +; SI-NEXT: v_mov_b32_e32 v34, v58 ; SI-NEXT: v_mov_b32_e32 v32, v55 +; SI-NEXT: v_mov_b32_e32 v58, v61 ; SI-NEXT: v_mov_b32_e32 v63, v2 ; SI-NEXT: v_mov_b32_e32 v55, v4 ; SI-NEXT: v_mov_b32_e32 v53, v6 ; SI-NEXT: v_mov_b32_e32 v52, v57 ; SI-NEXT: v_mov_b32_e32 v51, v50 ; SI-NEXT: v_mov_b32_e32 v61, v56 -; SI-NEXT: v_mov_b32_e32 v50, v43 +; SI-NEXT: v_mov_b32_e32 v50, v45 ; SI-NEXT: v_mov_b32_e32 v36, v41 -; SI-NEXT: v_mov_b32_e32 v57, v40 -; SI-NEXT: v_mov_b32_e32 v40, v49 -; SI-NEXT: v_mov_b32_e32 v49, v13 -; SI-NEXT: v_mov_b32_e32 v43, v48 -; SI-NEXT: v_mov_b32_e32 v48, v15 -; SI-NEXT: v_mov_b32_e32 v41, v14 +; SI-NEXT: v_mov_b32_e32 v41, v13 +; SI-NEXT: v_mov_b32_e32 v57, v49 +; SI-NEXT: v_mov_b32_e32 v49, v40 +; SI-NEXT: v_mov_b32_e32 v40, v15 +; SI-NEXT: v_mov_b32_e32 v45, v14 ; SI-NEXT: v_mov_b32_e32 v56, v16 ; SI-NEXT: v_mov_b32_e32 v47, 
v46 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_mov_b32_e32 v59, v42 +; SI-NEXT: v_mov_b32_e32 v59, v44 +; SI-NEXT: v_mov_b32_e32 v43, v42 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v37, v20 ; SI-NEXT: v_mov_b32_e32 v39, v23 @@ -82115,7 +82139,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: v_readfirstlane_b32 s5, v14 ; GFX11-NEXT: v_writelane_b32 v16, s37, 5 ; GFX11-NEXT: v_writelane_b32 v17, s101, 5 -; GFX11-NEXT: s_mov_b32 s101, 0 +; GFX11-NEXT: s_mov_b32 s44, 0 ; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo ; GFX11-NEXT: ; implicit-def: $vgpr19 : SGPR spill to VGPR lane ; GFX11-NEXT: ; implicit-def: $vgpr18 : SGPR spill to VGPR lane @@ -82148,297 +82172,155 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: v_writelane_b32 v16, s85, 29 ; GFX11-NEXT: v_writelane_b32 v16, s86, 30 ; GFX11-NEXT: v_writelane_b32 v16, s87, 31 -; GFX11-NEXT: s_cbranch_scc0 .LBB57_2 +; GFX11-NEXT: s_cbranch_scc0 .LBB57_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: s_lshr_b32 s43, s25, 8 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[4:5], 24 -; GFX11-NEXT: v_writelane_b32 v19, s43, 16 -; GFX11-NEXT: s_lshr_b32 s43, s24, 16 -; GFX11-NEXT: s_lshr_b32 s104, s5, 24 -; GFX11-NEXT: s_lshr_b32 s102, s5, 16 -; GFX11-NEXT: s_lshr_b32 s103, s5, 8 -; GFX11-NEXT: v_writelane_b32 v19, s43, 17 -; GFX11-NEXT: s_lshr_b32 s43, s24, 8 -; GFX11-NEXT: s_lshr_b32 s57, s4, 16 -; GFX11-NEXT: s_lshr_b32 s47, s4, 8 -; GFX11-NEXT: s_lshr_b32 s46, s7, 24 -; GFX11-NEXT: v_writelane_b32 v19, s43, 18 -; GFX11-NEXT: s_lshr_b32 s43, s23, 24 -; GFX11-NEXT: s_lshr_b32 vcc_hi, s7, 16 -; GFX11-NEXT: s_lshr_b32 s34, s7, 8 -; GFX11-NEXT: s_lshr_b32 s69, s6, 16 -; GFX11-NEXT: v_writelane_b32 v19, s43, 19 -; GFX11-NEXT: s_lshr_b32 s43, s23, 16 -; GFX11-NEXT: s_lshr_b32 s56, s6, 8 -; GFX11-NEXT: s_lshr_b32 s35, s9, 24 -; GFX11-NEXT: s_lshr_b32 s36, s9, 16 -; GFX11-NEXT: v_writelane_b32 v19, s43, 20 -; GFX11-NEXT: 
s_lshr_b32 s43, s23, 8 -; GFX11-NEXT: s_lshr_b32 s37, s9, 8 -; GFX11-NEXT: s_lshr_b32 s38, s8, 16 -; GFX11-NEXT: s_lshr_b32 s39, s8, 8 -; GFX11-NEXT: v_writelane_b32 v19, s43, 21 -; GFX11-NEXT: s_lshr_b32 s43, s22, 16 -; GFX11-NEXT: s_lshr_b32 s48, s11, 24 -; GFX11-NEXT: s_lshr_b32 s49, s11, 16 -; GFX11-NEXT: s_lshr_b32 s50, s11, 8 -; GFX11-NEXT: v_writelane_b32 v19, s43, 22 -; GFX11-NEXT: s_lshr_b32 s43, s22, 8 -; GFX11-NEXT: s_lshr_b32 s51, s10, 16 -; GFX11-NEXT: s_lshr_b32 s52, s10, 8 -; GFX11-NEXT: s_lshr_b32 s53, s13, 24 -; GFX11-NEXT: v_writelane_b32 v19, s43, 23 -; GFX11-NEXT: s_lshr_b32 s43, s21, 24 -; GFX11-NEXT: s_lshr_b32 s54, s13, 16 -; GFX11-NEXT: s_lshr_b32 s55, s13, 8 -; GFX11-NEXT: s_lshr_b32 s64, s12, 16 -; GFX11-NEXT: v_writelane_b32 v19, s43, 24 -; GFX11-NEXT: s_lshr_b32 s43, s21, 16 -; GFX11-NEXT: s_lshr_b32 s65, s12, 8 -; GFX11-NEXT: s_lshr_b32 s66, s15, 24 -; GFX11-NEXT: s_lshr_b32 s67, s15, 16 -; GFX11-NEXT: v_writelane_b32 v19, s43, 25 -; GFX11-NEXT: s_lshr_b32 s43, s21, 8 -; GFX11-NEXT: s_lshr_b32 s68, s15, 8 -; GFX11-NEXT: s_lshr_b32 s59, s14, 16 -; GFX11-NEXT: s_lshr_b32 s58, s14, 8 -; GFX11-NEXT: v_writelane_b32 v19, s43, 26 -; GFX11-NEXT: s_lshr_b32 s43, s20, 16 -; GFX11-NEXT: s_lshr_b32 s70, s41, 24 -; GFX11-NEXT: s_lshr_b32 s71, s41, 16 -; GFX11-NEXT: s_lshr_b32 s60, s41, 8 -; GFX11-NEXT: v_writelane_b32 v19, s43, 27 -; GFX11-NEXT: s_lshr_b32 s43, s20, 8 -; GFX11-NEXT: s_lshr_b32 s80, s40, 16 -; GFX11-NEXT: s_lshr_b32 s61, s40, 8 -; GFX11-NEXT: s_lshr_b32 s81, s29, 24 -; GFX11-NEXT: v_writelane_b32 v19, s43, 28 -; GFX11-NEXT: s_lshr_b32 s43, s19, 24 -; GFX11-NEXT: s_lshr_b32 s82, s29, 16 -; GFX11-NEXT: s_lshr_b32 s83, s29, 8 -; GFX11-NEXT: s_lshr_b32 s84, s28, 16 -; GFX11-NEXT: v_writelane_b32 v19, s43, 29 -; GFX11-NEXT: s_lshr_b32 s43, s19, 16 -; GFX11-NEXT: s_lshr_b32 s85, s28, 8 -; GFX11-NEXT: s_lshr_b32 s86, s27, 24 -; GFX11-NEXT: s_lshr_b32 s72, s27, 16 -; GFX11-NEXT: v_writelane_b32 v19, s43, 30 -; GFX11-NEXT: s_lshr_b32 s43, 
s19, 8 -; GFX11-NEXT: s_lshr_b32 s87, s27, 8 -; GFX11-NEXT: s_lshr_b32 s73, s26, 16 -; GFX11-NEXT: s_lshr_b32 s96, s26, 8 -; GFX11-NEXT: v_writelane_b32 v19, s43, 31 -; GFX11-NEXT: s_lshr_b32 s43, s18, 16 -; GFX11-NEXT: s_lshr_b32 s97, s25, 24 -; GFX11-NEXT: v_writelane_b32 v18, s43, 0 -; GFX11-NEXT: s_lshr_b32 s43, s18, 8 -; GFX11-NEXT: v_writelane_b32 v19, s62, 14 -; GFX11-NEXT: s_lshr_b32 s42, s25, 16 -; GFX11-NEXT: s_lshr_b32 s74, s2, 16 -; GFX11-NEXT: v_writelane_b32 v18, s43, 1 -; GFX11-NEXT: s_lshr_b32 s43, s17, 24 -; GFX11-NEXT: v_writelane_b32 v19, s63, 15 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[6:7], 24 +; GFX11-NEXT: s_lshr_b32 s42, s5, 24 +; GFX11-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 +; GFX11-NEXT: v_writelane_b32 v19, s42, 4 +; GFX11-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-NEXT: s_lshr_b32 s102, s29, 8 +; GFX11-NEXT: s_lshr_b32 s103, s28, 16 +; GFX11-NEXT: s_lshr_b32 s104, s28, 8 +; GFX11-NEXT: v_writelane_b32 v19, s42, 5 +; GFX11-NEXT: s_lshr_b32 s42, s5, 8 +; GFX11-NEXT: s_lshr_b32 vcc_hi, s27, 24 +; GFX11-NEXT: s_lshr_b32 s34, s27, 16 +; GFX11-NEXT: s_lshr_b32 s35, s27, 8 +; GFX11-NEXT: v_writelane_b32 v19, s42, 6 +; GFX11-NEXT: s_lshr_b32 s42, s4, 16 +; GFX11-NEXT: s_lshr_b32 s36, s26, 16 +; GFX11-NEXT: s_lshr_b32 s37, s26, 8 +; GFX11-NEXT: s_lshr_b32 s38, s25, 24 +; GFX11-NEXT: v_writelane_b32 v19, s42, 7 +; GFX11-NEXT: s_lshr_b32 s42, s4, 8 +; GFX11-NEXT: s_lshr_b32 s39, s25, 16 +; GFX11-NEXT: s_lshr_b32 s48, s25, 8 +; GFX11-NEXT: s_lshr_b32 s49, s24, 16 +; GFX11-NEXT: v_writelane_b32 v19, s42, 8 +; GFX11-NEXT: s_lshr_b32 s42, s7, 24 +; GFX11-NEXT: s_lshr_b32 s50, s24, 8 +; GFX11-NEXT: s_lshr_b32 s51, s23, 24 +; GFX11-NEXT: s_lshr_b32 s52, s23, 16 +; GFX11-NEXT: v_writelane_b32 v19, s42, 9 +; GFX11-NEXT: s_lshr_b32 s42, s7, 16 +; GFX11-NEXT: s_lshr_b32 s53, s23, 8 +; GFX11-NEXT: s_lshr_b32 s54, s22, 16 +; GFX11-NEXT: s_lshr_b32 s55, s22, 8 +; GFX11-NEXT: v_writelane_b32 v19, s42, 10 +; GFX11-NEXT: s_lshr_b32 s42, s7, 8 +; GFX11-NEXT: s_lshr_b32 s64, 
s21, 24 +; GFX11-NEXT: s_lshr_b32 s65, s21, 16 +; GFX11-NEXT: s_lshr_b32 s66, s21, 8 +; GFX11-NEXT: v_writelane_b32 v19, s42, 11 +; GFX11-NEXT: s_lshr_b32 s42, s6, 16 +; GFX11-NEXT: s_lshr_b32 s67, s20, 16 +; GFX11-NEXT: s_lshr_b32 s68, s20, 8 +; GFX11-NEXT: s_lshr_b32 s69, s19, 24 +; GFX11-NEXT: v_writelane_b32 v19, s42, 12 +; GFX11-NEXT: s_lshr_b32 s42, s6, 8 +; GFX11-NEXT: s_lshr_b32 s70, s19, 16 +; GFX11-NEXT: s_lshr_b32 s71, s19, 8 +; GFX11-NEXT: s_lshr_b32 s80, s18, 16 +; GFX11-NEXT: v_writelane_b32 v19, s42, 13 +; GFX11-NEXT: s_lshr_b32 s42, s9, 24 +; GFX11-NEXT: s_lshr_b32 s81, s18, 8 +; GFX11-NEXT: s_lshr_b32 s82, s17, 24 +; GFX11-NEXT: s_lshr_b32 s83, s17, 16 +; GFX11-NEXT: v_writelane_b32 v19, s42, 14 +; GFX11-NEXT: s_lshr_b32 s42, s9, 16 +; GFX11-NEXT: s_lshr_b32 s84, s17, 8 +; GFX11-NEXT: s_lshr_b32 s85, s16, 16 +; GFX11-NEXT: s_lshr_b32 s86, s16, 8 +; GFX11-NEXT: v_writelane_b32 v19, s42, 15 +; GFX11-NEXT: s_lshr_b32 s42, s9, 8 +; GFX11-NEXT: s_lshr_b32 s87, s3, 24 +; GFX11-NEXT: s_lshr_b32 s96, s3, 16 +; GFX11-NEXT: s_lshr_b32 s97, s3, 8 +; GFX11-NEXT: v_writelane_b32 v19, s42, 16 +; GFX11-NEXT: s_lshr_b32 s42, s8, 16 +; GFX11-NEXT: s_lshr_b32 s43, s2, 8 ; GFX11-NEXT: s_lshr_b32 s98, s1, 24 -; GFX11-NEXT: v_writelane_b32 v18, s43, 2 -; GFX11-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-NEXT: v_writelane_b32 v19, s62, 12 ; GFX11-NEXT: s_lshr_b32 s99, s1, 16 +; GFX11-NEXT: v_writelane_b32 v19, s42, 17 +; GFX11-NEXT: s_lshr_b32 s42, s8, 8 ; GFX11-NEXT: s_lshr_b32 s100, s1, 8 -; GFX11-NEXT: v_writelane_b32 v18, s43, 3 -; GFX11-NEXT: s_lshr_b32 s43, s17, 8 -; GFX11-NEXT: v_writelane_b32 v19, s63, 13 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[8:9], 24 -; GFX11-NEXT: s_lshr_b32 s44, s0, 16 -; GFX11-NEXT: v_writelane_b32 v18, s43, 4 -; GFX11-NEXT: s_lshr_b32 s43, s16, 16 -; GFX11-NEXT: v_writelane_b32 v19, s62, 10 -; GFX11-NEXT: s_lshr_b32 s45, s0, 8 +; GFX11-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-NEXT: s_lshr_b32 s101, s0, 8 +; GFX11-NEXT: v_writelane_b32 v19, s42, 18 
+; GFX11-NEXT: s_lshr_b32 s42, s11, 24 +; GFX11-NEXT: s_lshr_b64 s[56:57], s[10:11], 24 +; GFX11-NEXT: s_lshr_b64 s[58:59], s[12:13], 24 +; GFX11-NEXT: s_lshr_b64 s[60:61], s[14:15], 24 +; GFX11-NEXT: v_writelane_b32 v19, s42, 19 +; GFX11-NEXT: s_lshr_b32 s42, s11, 16 +; GFX11-NEXT: s_lshr_b64 s[62:63], s[40:41], 24 +; GFX11-NEXT: s_lshr_b64 s[72:73], s[28:29], 24 ; GFX11-NEXT: s_lshr_b64 s[76:77], s[26:27], 24 -; GFX11-NEXT: v_writelane_b32 v18, s43, 5 -; GFX11-NEXT: s_lshr_b32 s43, s16, 8 -; GFX11-NEXT: v_writelane_b32 v19, s63, 11 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[10:11], 24 +; GFX11-NEXT: v_writelane_b32 v19, s42, 20 +; GFX11-NEXT: s_lshr_b32 s42, s11, 8 ; GFX11-NEXT: s_lshr_b64 s[88:89], s[24:25], 24 -; GFX11-NEXT: v_writelane_b32 v18, s43, 6 -; GFX11-NEXT: s_lshr_b32 s43, s3, 24 -; GFX11-NEXT: v_writelane_b32 v19, s62, 8 +; GFX11-NEXT: s_lshr_b64 s[74:75], s[22:23], 24 ; GFX11-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 +; GFX11-NEXT: v_writelane_b32 v19, s42, 21 +; GFX11-NEXT: s_lshr_b32 s42, s10, 16 ; GFX11-NEXT: s_lshr_b64 s[90:91], s[18:19], 24 -; GFX11-NEXT: v_writelane_b32 v18, s43, 7 -; GFX11-NEXT: s_lshr_b32 s43, s3, 16 -; GFX11-NEXT: v_writelane_b32 v19, s63, 9 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 ; GFX11-NEXT: s_lshr_b64 s[92:93], s[16:17], 24 -; GFX11-NEXT: v_writelane_b32 v18, s43, 8 -; GFX11-NEXT: s_lshr_b32 s43, s3, 8 -; GFX11-NEXT: v_writelane_b32 v19, s62, 6 ; GFX11-NEXT: s_lshr_b64 s[94:95], s[2:3], 24 +; GFX11-NEXT: v_writelane_b32 v19, s42, 22 +; GFX11-NEXT: s_lshr_b32 s42, s10, 8 ; GFX11-NEXT: s_lshr_b64 s[30:31], s[0:1], 24 -; GFX11-NEXT: v_writelane_b32 v18, s43, 9 -; GFX11-NEXT: s_lshr_b32 s43, s2, 8 -; GFX11-NEXT: v_writelane_b32 v19, s63, 7 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[14:15], 24 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v19, s62, 4 -; GFX11-NEXT: v_writelane_b32 v19, s63, 5 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[40:41], 24 -; 
GFX11-NEXT: v_writelane_b32 v19, s62, 2 -; GFX11-NEXT: v_writelane_b32 v19, s63, 3 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[28:29], 24 +; GFX11-NEXT: v_writelane_b32 v19, s42, 23 +; GFX11-NEXT: s_lshr_b32 s42, s13, 24 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v19, s42, 24 +; GFX11-NEXT: s_lshr_b32 s42, s13, 16 +; GFX11-NEXT: v_writelane_b32 v19, s42, 25 +; GFX11-NEXT: s_lshr_b32 s42, s13, 8 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v19, s42, 26 +; GFX11-NEXT: s_lshr_b32 s42, s12, 16 +; GFX11-NEXT: v_writelane_b32 v19, s42, 27 +; GFX11-NEXT: s_lshr_b32 s42, s12, 8 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v19, s42, 28 +; GFX11-NEXT: s_lshr_b32 s42, s15, 24 +; GFX11-NEXT: v_writelane_b32 v19, s42, 29 +; GFX11-NEXT: s_lshr_b32 s42, s15, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v19, s42, 30 +; GFX11-NEXT: s_lshr_b32 s42, s15, 8 +; GFX11-NEXT: v_writelane_b32 v19, s42, 31 +; GFX11-NEXT: s_lshr_b32 s42, s14, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v19, s62, 0 -; GFX11-NEXT: v_writelane_b32 v19, s63, 1 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 -; GFX11-NEXT: s_branch .LBB57_3 -; GFX11-NEXT: .LBB57_2: -; GFX11-NEXT: ; implicit-def: $vcc_hi -; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: s_mov_b32 s101, -1 -; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 0 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; 
GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v19, vcc_hi, 1 -; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 2 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v19, vcc_hi, 3 -; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 4 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v19, vcc_hi, 5 -; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: ; implicit-def: $sgpr45 
-; GFX11-NEXT: ; implicit-def: $sgpr44 -; GFX11-NEXT: ; implicit-def: $sgpr30 -; GFX11-NEXT: ; implicit-def: $sgpr100 -; GFX11-NEXT: ; implicit-def: $sgpr99 -; GFX11-NEXT: ; implicit-def: $sgpr98 -; GFX11-NEXT: ; implicit-def: $sgpr43 -; GFX11-NEXT: ; implicit-def: $sgpr74 -; GFX11-NEXT: ; implicit-def: $sgpr94 -; GFX11-NEXT: ; implicit-def: $sgpr92 -; GFX11-NEXT: ; implicit-def: $sgpr90 -; GFX11-NEXT: ; implicit-def: $sgpr78 -; GFX11-NEXT: ; implicit-def: $sgpr62 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr97 -; GFX11-NEXT: ; implicit-def: $sgpr96 -; GFX11-NEXT: ; implicit-def: $sgpr73 -; GFX11-NEXT: ; implicit-def: $sgpr87 -; GFX11-NEXT: ; implicit-def: $sgpr72 -; GFX11-NEXT: ; implicit-def: $sgpr86 -; GFX11-NEXT: ; implicit-def: $sgpr85 -; GFX11-NEXT: ; implicit-def: $sgpr84 -; GFX11-NEXT: ; implicit-def: $sgpr83 -; GFX11-NEXT: ; implicit-def: $sgpr82 -; GFX11-NEXT: ; implicit-def: $sgpr81 -; GFX11-NEXT: ; implicit-def: $sgpr61 -; GFX11-NEXT: ; implicit-def: $sgpr80 -; GFX11-NEXT: ; implicit-def: $sgpr60 -; GFX11-NEXT: ; implicit-def: $sgpr71 -; GFX11-NEXT: ; implicit-def: $sgpr70 -; GFX11-NEXT: ; implicit-def: $sgpr58 -; GFX11-NEXT: ; implicit-def: $sgpr59 -; GFX11-NEXT: ; implicit-def: $sgpr68 -; GFX11-NEXT: ; implicit-def: $sgpr67 -; GFX11-NEXT: ; implicit-def: $sgpr66 -; GFX11-NEXT: ; implicit-def: $sgpr65 -; GFX11-NEXT: ; implicit-def: $sgpr64 -; GFX11-NEXT: ; implicit-def: $sgpr55 -; GFX11-NEXT: ; implicit-def: $sgpr54 -; GFX11-NEXT: ; implicit-def: $sgpr53 -; GFX11-NEXT: ; implicit-def: $sgpr52 -; GFX11-NEXT: ; implicit-def: $sgpr51 -; GFX11-NEXT: ; implicit-def: $sgpr50 -; GFX11-NEXT: ; implicit-def: $sgpr49 -; GFX11-NEXT: ; implicit-def: $sgpr48 -; GFX11-NEXT: ; implicit-def: $sgpr39 -; GFX11-NEXT: ; implicit-def: $sgpr38 -; GFX11-NEXT: ; implicit-def: $sgpr37 -; GFX11-NEXT: ; implicit-def: $sgpr36 -; GFX11-NEXT: ; implicit-def: $sgpr35 -; GFX11-NEXT: ; implicit-def: $sgpr56 -; 
GFX11-NEXT: ; implicit-def: $sgpr69 -; GFX11-NEXT: ; implicit-def: $sgpr34 -; GFX11-NEXT: ; implicit-def: $sgpr46 -; GFX11-NEXT: ; implicit-def: $sgpr47 -; GFX11-NEXT: ; implicit-def: $sgpr57 -; GFX11-NEXT: ; implicit-def: $sgpr103 -; GFX11-NEXT: ; implicit-def: $sgpr102 -; GFX11-NEXT: ; implicit-def: $sgpr104 -; GFX11-NEXT: ; implicit-def: $sgpr88 -; GFX11-NEXT: ; implicit-def: $sgpr76 -; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 6 -; GFX11-NEXT: v_writelane_b32 v19, vcc_hi, 7 -; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 8 -; GFX11-NEXT: v_writelane_b32 v19, vcc_hi, 9 -; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 10 -; GFX11-NEXT: v_writelane_b32 v19, vcc_hi, 11 -; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 12 -; GFX11-NEXT: v_writelane_b32 v19, vcc_hi, 13 -; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 14 -; GFX11-NEXT: v_writelane_b32 v19, vcc_hi, 15 -; GFX11-NEXT: .LBB57_3: ; %Flow -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s101 -; GFX11-NEXT: s_mov_b32 s101, s104 -; GFX11-NEXT: s_mov_b32 s104, s57 -; GFX11-NEXT: s_mov_b32 s57, s69 -; GFX11-NEXT: s_mov_b32 s69, s42 -; GFX11-NEXT: s_cbranch_vccnz .LBB57_5 -; GFX11-NEXT: ; %bb.4: ; %cmp.true +; GFX11-NEXT: v_writelane_b32 v18, s42, 0 +; GFX11-NEXT: s_lshr_b32 s42, s14, 8 +; GFX11-NEXT: v_writelane_b32 v19, s46, 2 +; GFX11-NEXT: v_writelane_b32 v18, s42, 1 +; GFX11-NEXT: s_lshr_b32 s42, s41, 24 +; GFX11-NEXT: v_writelane_b32 v19, s47, 3 +; GFX11-NEXT: s_lshr_b64 s[46:47], s[6:7], 24 +; GFX11-NEXT: v_writelane_b32 v18, s42, 2 +; GFX11-NEXT: s_lshr_b32 s42, s41, 16 +; GFX11-NEXT: v_writelane_b32 v19, s46, 0 +; GFX11-NEXT: v_writelane_b32 v18, s42, 3 +; GFX11-NEXT: s_lshr_b32 s42, s41, 8 +; GFX11-NEXT: v_writelane_b32 v19, s47, 1 +; GFX11-NEXT: s_lshr_b64 s[46:47], s[8:9], 24 +; GFX11-NEXT: v_writelane_b32 v18, s42, 4 +; GFX11-NEXT: s_lshr_b32 s42, s40, 16 +; 
GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v18, s42, 5 +; GFX11-NEXT: s_lshr_b32 s42, s40, 8 +; GFX11-NEXT: v_writelane_b32 v18, s42, 6 +; GFX11-NEXT: s_lshr_b32 s42, s29, 24 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v18, s42, 7 +; GFX11-NEXT: s_lshr_b32 s42, s29, 16 +; GFX11-NEXT: v_writelane_b32 v18, s42, 8 +; GFX11-NEXT: s_lshr_b32 s42, s2, 16 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s44 +; GFX11-NEXT: s_cbranch_vccnz .LBB57_3 +; GFX11-NEXT: .LBB57_2: ; %cmp.true ; GFX11-NEXT: s_add_u32 s0, s0, 3 ; GFX11-NEXT: s_addc_u32 s1, s1, 0 ; GFX11-NEXT: s_add_u32 s2, s2, 3 @@ -82471,355 +82353,321 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: s_addc_u32 s7, s7, 0 ; GFX11-NEXT: s_add_u32 s4, s4, 3 ; GFX11-NEXT: s_addc_u32 s5, s5, 0 -; GFX11-NEXT: s_lshr_b32 s42, s25, 8 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[4:5], 24 +; GFX11-NEXT: s_lshr_b32 s102, s29, 8 +; GFX11-NEXT: s_lshr_b32 s42, s5, 24 +; GFX11-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 +; GFX11-NEXT: v_writelane_b32 v19, s42, 4 +; GFX11-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-NEXT: s_lshr_b32 s103, s28, 16 +; GFX11-NEXT: s_lshr_b32 s104, s28, 8 +; GFX11-NEXT: s_lshr_b32 vcc_hi, s27, 24 +; GFX11-NEXT: v_writelane_b32 v19, s42, 5 +; GFX11-NEXT: s_lshr_b32 s42, s5, 8 +; GFX11-NEXT: s_lshr_b32 s34, s27, 16 +; GFX11-NEXT: s_lshr_b32 s35, s27, 8 +; GFX11-NEXT: s_lshr_b32 s36, s26, 16 +; GFX11-NEXT: v_writelane_b32 v19, s42, 6 +; GFX11-NEXT: s_lshr_b32 s42, s4, 16 +; GFX11-NEXT: s_lshr_b32 s37, s26, 8 +; GFX11-NEXT: s_lshr_b32 s38, s25, 24 +; GFX11-NEXT: s_lshr_b32 s39, s25, 16 +; GFX11-NEXT: v_writelane_b32 v19, s42, 7 +; GFX11-NEXT: s_lshr_b32 s42, s4, 8 +; GFX11-NEXT: s_lshr_b32 s48, s25, 8 +; GFX11-NEXT: s_lshr_b32 s49, s24, 16 +; GFX11-NEXT: s_lshr_b32 s50, s24, 8 +; GFX11-NEXT: v_writelane_b32 v19, s42, 8 +; 
GFX11-NEXT: s_lshr_b32 s42, s7, 24 +; GFX11-NEXT: s_lshr_b32 s51, s23, 24 +; GFX11-NEXT: s_lshr_b32 s52, s23, 16 +; GFX11-NEXT: s_lshr_b32 s53, s23, 8 +; GFX11-NEXT: v_writelane_b32 v19, s42, 9 +; GFX11-NEXT: s_lshr_b32 s42, s7, 16 +; GFX11-NEXT: s_lshr_b32 s54, s22, 16 +; GFX11-NEXT: s_lshr_b32 s55, s22, 8 +; GFX11-NEXT: s_lshr_b32 s64, s21, 24 +; GFX11-NEXT: v_writelane_b32 v19, s42, 10 +; GFX11-NEXT: s_lshr_b32 s42, s7, 8 +; GFX11-NEXT: s_lshr_b32 s65, s21, 16 +; GFX11-NEXT: s_lshr_b32 s66, s21, 8 +; GFX11-NEXT: s_lshr_b32 s67, s20, 16 +; GFX11-NEXT: v_writelane_b32 v19, s42, 11 +; GFX11-NEXT: s_lshr_b32 s42, s6, 16 +; GFX11-NEXT: s_lshr_b32 s68, s20, 8 +; GFX11-NEXT: s_lshr_b32 s69, s19, 24 +; GFX11-NEXT: s_lshr_b32 s70, s19, 16 +; GFX11-NEXT: v_writelane_b32 v19, s42, 12 +; GFX11-NEXT: s_lshr_b32 s42, s6, 8 +; GFX11-NEXT: s_lshr_b32 s71, s19, 8 +; GFX11-NEXT: s_lshr_b32 s80, s18, 16 +; GFX11-NEXT: s_lshr_b32 s81, s18, 8 +; GFX11-NEXT: v_writelane_b32 v19, s42, 13 +; GFX11-NEXT: s_lshr_b32 s42, s9, 24 +; GFX11-NEXT: s_lshr_b32 s82, s17, 24 +; GFX11-NEXT: s_lshr_b32 s83, s17, 16 +; GFX11-NEXT: s_lshr_b32 s84, s17, 8 +; GFX11-NEXT: v_writelane_b32 v19, s42, 14 +; GFX11-NEXT: s_lshr_b32 s42, s9, 16 +; GFX11-NEXT: s_lshr_b32 s85, s16, 16 +; GFX11-NEXT: s_lshr_b32 s86, s16, 8 +; GFX11-NEXT: s_lshr_b32 s87, s3, 24 +; GFX11-NEXT: v_writelane_b32 v19, s42, 15 +; GFX11-NEXT: s_lshr_b32 s42, s9, 8 +; GFX11-NEXT: s_lshr_b32 s96, s3, 16 +; GFX11-NEXT: s_lshr_b32 s97, s3, 8 +; GFX11-NEXT: s_lshr_b32 s43, s2, 8 ; GFX11-NEXT: v_writelane_b32 v19, s42, 16 -; GFX11-NEXT: s_lshr_b32 s42, s24, 16 -; GFX11-NEXT: s_lshr_b32 s101, s5, 24 -; GFX11-NEXT: s_lshr_b32 s102, s5, 16 -; GFX11-NEXT: s_lshr_b32 s103, s5, 8 +; GFX11-NEXT: s_lshr_b32 s42, s8, 16 +; GFX11-NEXT: s_lshr_b32 s98, s1, 24 +; GFX11-NEXT: s_lshr_b32 s99, s1, 16 +; GFX11-NEXT: s_lshr_b32 s100, s1, 8 ; GFX11-NEXT: v_writelane_b32 v19, s42, 17 -; GFX11-NEXT: s_lshr_b32 s42, s24, 8 -; GFX11-NEXT: s_lshr_b32 s104, s4, 16 -; 
GFX11-NEXT: s_lshr_b32 s47, s4, 8 -; GFX11-NEXT: s_lshr_b32 s46, s7, 24 +; GFX11-NEXT: s_lshr_b32 s42, s8, 8 +; GFX11-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-NEXT: s_lshr_b32 s101, s0, 8 +; GFX11-NEXT: s_lshr_b64 s[56:57], s[10:11], 24 ; GFX11-NEXT: v_writelane_b32 v19, s42, 18 -; GFX11-NEXT: s_lshr_b32 s42, s23, 24 -; GFX11-NEXT: s_lshr_b32 vcc_hi, s7, 16 -; GFX11-NEXT: s_lshr_b32 s34, s7, 8 -; GFX11-NEXT: s_lshr_b32 s57, s6, 16 +; GFX11-NEXT: s_lshr_b32 s42, s11, 24 +; GFX11-NEXT: s_lshr_b64 s[58:59], s[12:13], 24 +; GFX11-NEXT: s_lshr_b64 s[60:61], s[14:15], 24 +; GFX11-NEXT: s_lshr_b64 s[62:63], s[40:41], 24 ; GFX11-NEXT: v_writelane_b32 v19, s42, 19 -; GFX11-NEXT: s_lshr_b32 s42, s23, 16 -; GFX11-NEXT: s_lshr_b32 s56, s6, 8 -; GFX11-NEXT: s_lshr_b32 s35, s9, 24 -; GFX11-NEXT: s_lshr_b32 s36, s9, 16 +; GFX11-NEXT: s_lshr_b32 s42, s11, 16 +; GFX11-NEXT: s_lshr_b64 s[72:73], s[28:29], 24 +; GFX11-NEXT: s_lshr_b64 s[76:77], s[26:27], 24 +; GFX11-NEXT: s_lshr_b64 s[88:89], s[24:25], 24 ; GFX11-NEXT: v_writelane_b32 v19, s42, 20 -; GFX11-NEXT: s_lshr_b32 s42, s23, 8 -; GFX11-NEXT: s_lshr_b32 s37, s9, 8 -; GFX11-NEXT: s_lshr_b32 s38, s8, 16 -; GFX11-NEXT: s_lshr_b32 s39, s8, 8 +; GFX11-NEXT: s_lshr_b32 s42, s11, 8 +; GFX11-NEXT: s_lshr_b64 s[74:75], s[22:23], 24 +; GFX11-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 +; GFX11-NEXT: s_lshr_b64 s[90:91], s[18:19], 24 ; GFX11-NEXT: v_writelane_b32 v19, s42, 21 -; GFX11-NEXT: s_lshr_b32 s42, s22, 16 -; GFX11-NEXT: s_lshr_b32 s48, s11, 24 -; GFX11-NEXT: s_lshr_b32 s49, s11, 16 -; GFX11-NEXT: s_lshr_b32 s50, s11, 8 +; GFX11-NEXT: s_lshr_b32 s42, s10, 16 +; GFX11-NEXT: s_lshr_b64 s[92:93], s[16:17], 24 +; GFX11-NEXT: s_lshr_b64 s[94:95], s[2:3], 24 +; GFX11-NEXT: s_lshr_b64 s[30:31], s[0:1], 24 ; GFX11-NEXT: v_writelane_b32 v19, s42, 22 -; GFX11-NEXT: s_lshr_b32 s42, s22, 8 -; GFX11-NEXT: s_lshr_b32 s51, s10, 16 -; GFX11-NEXT: s_lshr_b32 s52, s10, 8 -; GFX11-NEXT: s_lshr_b32 s53, s13, 24 +; GFX11-NEXT: s_lshr_b32 s42, s10, 8 +; 
GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_writelane_b32 v19, s42, 23 -; GFX11-NEXT: s_lshr_b32 s42, s21, 24 -; GFX11-NEXT: s_lshr_b32 s54, s13, 16 -; GFX11-NEXT: s_lshr_b32 s55, s13, 8 -; GFX11-NEXT: s_lshr_b32 s64, s12, 16 +; GFX11-NEXT: s_lshr_b32 s42, s13, 24 ; GFX11-NEXT: v_writelane_b32 v19, s42, 24 -; GFX11-NEXT: s_lshr_b32 s42, s21, 16 -; GFX11-NEXT: s_lshr_b32 s65, s12, 8 -; GFX11-NEXT: s_lshr_b32 s66, s15, 24 -; GFX11-NEXT: s_lshr_b32 s67, s15, 16 +; GFX11-NEXT: s_lshr_b32 s42, s13, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_writelane_b32 v19, s42, 25 -; GFX11-NEXT: s_lshr_b32 s42, s21, 8 -; GFX11-NEXT: s_lshr_b32 s68, s15, 8 -; GFX11-NEXT: s_lshr_b32 s59, s14, 16 -; GFX11-NEXT: s_lshr_b32 s58, s14, 8 +; GFX11-NEXT: s_lshr_b32 s42, s13, 8 ; GFX11-NEXT: v_writelane_b32 v19, s42, 26 -; GFX11-NEXT: s_lshr_b32 s42, s20, 16 -; GFX11-NEXT: s_lshr_b32 s70, s41, 24 -; GFX11-NEXT: s_lshr_b32 s71, s41, 16 -; GFX11-NEXT: s_lshr_b32 s60, s41, 8 +; GFX11-NEXT: s_lshr_b32 s42, s12, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_writelane_b32 v19, s42, 27 -; GFX11-NEXT: s_lshr_b32 s42, s20, 8 -; GFX11-NEXT: s_lshr_b32 s80, s40, 16 -; GFX11-NEXT: s_lshr_b32 s61, s40, 8 -; GFX11-NEXT: s_lshr_b32 s81, s29, 24 +; GFX11-NEXT: s_lshr_b32 s42, s12, 8 ; GFX11-NEXT: v_writelane_b32 v19, s42, 28 -; GFX11-NEXT: s_lshr_b32 s42, s19, 24 -; GFX11-NEXT: s_lshr_b32 s82, s29, 16 -; GFX11-NEXT: s_lshr_b32 s83, s29, 8 -; GFX11-NEXT: s_lshr_b32 s84, s28, 16 +; GFX11-NEXT: s_lshr_b32 s42, s15, 24 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_writelane_b32 v19, s42, 29 -; GFX11-NEXT: s_lshr_b32 s42, s19, 16 -; GFX11-NEXT: s_lshr_b32 s85, s28, 8 -; GFX11-NEXT: s_lshr_b32 s86, s27, 24 -; GFX11-NEXT: s_lshr_b32 s72, s27, 16 +; GFX11-NEXT: 
s_lshr_b32 s42, s15, 16 ; GFX11-NEXT: v_writelane_b32 v19, s42, 30 -; GFX11-NEXT: s_lshr_b32 s42, s19, 8 -; GFX11-NEXT: s_lshr_b32 s87, s27, 8 -; GFX11-NEXT: s_lshr_b32 s73, s26, 16 -; GFX11-NEXT: s_lshr_b32 s96, s26, 8 +; GFX11-NEXT: s_lshr_b32 s42, s15, 8 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_writelane_b32 v19, s42, 31 -; GFX11-NEXT: s_lshr_b32 s42, s18, 16 -; GFX11-NEXT: s_lshr_b32 s97, s25, 24 +; GFX11-NEXT: s_lshr_b32 s42, s14, 16 ; GFX11-NEXT: v_writelane_b32 v18, s42, 0 -; GFX11-NEXT: s_lshr_b32 s42, s18, 8 -; GFX11-NEXT: v_writelane_b32 v19, s62, 14 -; GFX11-NEXT: s_lshr_b32 s69, s25, 16 -; GFX11-NEXT: s_lshr_b32 s74, s2, 16 +; GFX11-NEXT: s_lshr_b32 s42, s14, 8 +; GFX11-NEXT: v_writelane_b32 v19, s46, 2 ; GFX11-NEXT: v_writelane_b32 v18, s42, 1 -; GFX11-NEXT: s_lshr_b32 s42, s17, 24 -; GFX11-NEXT: v_writelane_b32 v19, s63, 15 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[6:7], 24 -; GFX11-NEXT: s_lshr_b32 s43, s2, 8 +; GFX11-NEXT: s_lshr_b32 s42, s41, 24 +; GFX11-NEXT: v_writelane_b32 v19, s47, 3 +; GFX11-NEXT: s_lshr_b64 s[46:47], s[6:7], 24 ; GFX11-NEXT: v_writelane_b32 v18, s42, 2 -; GFX11-NEXT: s_lshr_b32 s42, s17, 16 -; GFX11-NEXT: v_writelane_b32 v19, s62, 12 -; GFX11-NEXT: s_lshr_b32 s98, s1, 24 -; GFX11-NEXT: s_lshr_b32 s99, s1, 16 +; GFX11-NEXT: s_lshr_b32 s42, s41, 16 +; GFX11-NEXT: v_writelane_b32 v19, s46, 0 ; GFX11-NEXT: v_writelane_b32 v18, s42, 3 -; GFX11-NEXT: s_lshr_b32 s42, s17, 8 -; GFX11-NEXT: v_writelane_b32 v19, s63, 13 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[8:9], 24 -; GFX11-NEXT: s_lshr_b32 s100, s1, 8 +; GFX11-NEXT: s_lshr_b32 s42, s41, 8 +; GFX11-NEXT: v_writelane_b32 v19, s47, 1 +; GFX11-NEXT: s_lshr_b64 s[46:47], s[8:9], 24 ; GFX11-NEXT: v_writelane_b32 v18, s42, 4 -; GFX11-NEXT: s_lshr_b32 s42, s16, 16 -; GFX11-NEXT: v_writelane_b32 v19, s62, 10 -; GFX11-NEXT: s_lshr_b32 s44, s0, 16 -; GFX11-NEXT: s_lshr_b32 s45, s0, 8 +; GFX11-NEXT: s_lshr_b32 s42, s40, 16 +; 
GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_writelane_b32 v18, s42, 5 -; GFX11-NEXT: s_lshr_b32 s42, s16, 8 -; GFX11-NEXT: v_writelane_b32 v19, s63, 11 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[10:11], 24 -; GFX11-NEXT: s_lshr_b64 s[76:77], s[26:27], 24 +; GFX11-NEXT: s_lshr_b32 s42, s40, 8 ; GFX11-NEXT: v_writelane_b32 v18, s42, 6 -; GFX11-NEXT: s_lshr_b32 s42, s3, 24 -; GFX11-NEXT: v_writelane_b32 v19, s62, 8 -; GFX11-NEXT: s_lshr_b64 s[88:89], s[24:25], 24 -; GFX11-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 +; GFX11-NEXT: s_lshr_b32 s42, s29, 24 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_writelane_b32 v18, s42, 7 -; GFX11-NEXT: s_lshr_b32 s42, s3, 16 -; GFX11-NEXT: v_writelane_b32 v19, s63, 9 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 -; GFX11-NEXT: s_lshr_b64 s[90:91], s[18:19], 24 +; GFX11-NEXT: s_lshr_b32 s42, s29, 16 ; GFX11-NEXT: v_writelane_b32 v18, s42, 8 -; GFX11-NEXT: s_lshr_b32 s42, s3, 8 -; GFX11-NEXT: v_writelane_b32 v19, s62, 6 -; GFX11-NEXT: s_lshr_b64 s[92:93], s[16:17], 24 -; GFX11-NEXT: s_lshr_b64 s[94:95], s[2:3], 24 -; GFX11-NEXT: s_lshr_b64 s[30:31], s[0:1], 24 -; GFX11-NEXT: v_writelane_b32 v18, s42, 9 -; GFX11-NEXT: v_writelane_b32 v19, s63, 7 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[14:15], 24 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v19, s62, 4 -; GFX11-NEXT: v_writelane_b32 v19, s63, 5 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[40:41], 24 -; GFX11-NEXT: v_writelane_b32 v19, s62, 2 -; GFX11-NEXT: v_writelane_b32 v19, s63, 3 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[28:29], 24 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v19, s62, 0 -; GFX11-NEXT: v_writelane_b32 v19, s63, 1 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 -; GFX11-NEXT: .LBB57_5: ; %end +; GFX11-NEXT: s_lshr_b32 s42, s2, 16 +; GFX11-NEXT: 
.LBB57_3: ; %end +; GFX11-NEXT: s_lshl_b32 s44, s101, 8 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_and_b32 s45, s45, 0xff +; GFX11-NEXT: s_or_b32 s0, s0, s44 +; GFX11-NEXT: s_lshl_b32 s44, s30, 8 ; GFX11-NEXT: s_lshl_b32 s43, s43, 8 +; GFX11-NEXT: s_or_b32 s44, s45, s44 ; GFX11-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-NEXT: s_and_b32 s42, s74, 0xff -; GFX11-NEXT: s_or_b32 s2, s2, s43 -; GFX11-NEXT: s_lshl_b32 s43, s94, 8 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_or_b32 s42, s42, s43 -; GFX11-NEXT: s_lshl_b32 s45, s45, 8 -; GFX11-NEXT: s_lshl_b32 s42, s42, 16 -; GFX11-NEXT: s_and_b32 s0, s0, 0xff -; GFX11-NEXT: s_or_b32 s2, s2, s42 -; GFX11-NEXT: v_readlane_b32 s42, v18, 9 -; GFX11-NEXT: s_or_b32 s0, s0, s45 -; GFX11-NEXT: s_lshl_b32 s45, s30, 8 -; GFX11-NEXT: s_and_b32 s44, s44, 0xff -; GFX11-NEXT: s_and_b32 s3, s3, 0xff -; GFX11-NEXT: s_or_b32 s44, s44, s45 -; GFX11-NEXT: s_lshl_b32 s42, s42, 8 ; GFX11-NEXT: s_and_b32 s0, s0, 0xffff ; GFX11-NEXT: s_lshl_b32 s44, s44, 16 -; GFX11-NEXT: s_or_b32 s3, s3, s42 -; GFX11-NEXT: v_readlane_b32 s42, v18, 8 -; GFX11-NEXT: v_readlane_b32 s43, v18, 7 +; GFX11-NEXT: s_or_b32 s2, s2, s43 +; GFX11-NEXT: s_lshl_b32 s43, s94, 8 +; GFX11-NEXT: s_and_b32 s42, s42, 0xff ; GFX11-NEXT: s_or_b32 s0, s0, s44 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff ; GFX11-NEXT: s_lshl_b32 s44, s100, 8 -; GFX11-NEXT: s_lshl_b32 s45, s98, 8 +; GFX11-NEXT: s_or_b32 s42, s42, s43 ; GFX11-NEXT: s_or_b32 s1, s1, s44 ; GFX11-NEXT: s_and_b32 s44, s99, 0xff -; GFX11-NEXT: s_and_b32 s42, s42, 0xff +; GFX11-NEXT: s_lshl_b32 s45, s98, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s42, s42, 16 ; GFX11-NEXT: s_or_b32 s44, s44, s45 -; GFX11-NEXT: s_lshl_b32 s43, s43, 8 +; GFX11-NEXT: s_or_b32 s2, s2, s42 +; GFX11-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-NEXT: s_lshl_b32 s42, s97, 8 ; GFX11-NEXT: s_and_b32 s1, s1, 0xffff ; GFX11-NEXT: s_lshl_b32 s44, s44, 16 -; GFX11-NEXT: s_or_b32 s42, s42, s43 +; GFX11-NEXT: s_or_b32 s3, s3, s42 
+; GFX11-NEXT: s_and_b32 s42, s96, 0xff +; GFX11-NEXT: s_lshl_b32 s43, s87, 8 ; GFX11-NEXT: s_or_b32 s1, s1, s44 +; GFX11-NEXT: s_or_b32 s42, s42, s43 ; GFX11-NEXT: s_and_b32 s3, s3, 0xffff ; GFX11-NEXT: s_lshl_b32 s42, s42, 16 ; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 -; GFX11-NEXT: v_readlane_b32 s0, v18, 6 +; GFX11-NEXT: s_lshl_b32 s0, s86, 8 +; GFX11-NEXT: s_and_b32 s1, s16, 0xff ; GFX11-NEXT: s_or_b32 s3, s3, s42 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 -; GFX11-NEXT: v_readlane_b32 s2, v18, 5 -; GFX11-NEXT: s_lshl_b32 s0, s0, 8 -; GFX11-NEXT: s_and_b32 s1, s16, 0xff -; GFX11-NEXT: v_readlane_b32 s3, v18, 2 ; GFX11-NEXT: s_or_b32 s0, s1, s0 ; GFX11-NEXT: s_lshl_b32 s1, s92, 8 -; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_and_b32 s2, s85, 0xff ; GFX11-NEXT: s_and_b32 s0, s0, 0xffff ; GFX11-NEXT: s_or_b32 s1, s2, s1 -; GFX11-NEXT: v_readlane_b32 s2, v18, 4 +; GFX11-NEXT: s_lshl_b32 s2, s84, 8 ; GFX11-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_lshl_b32 s3, s82, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_and_b32 s1, s17, 0xff -; GFX11-NEXT: s_lshl_b32 s2, s2, 8 -; GFX11-NEXT: v_readlane_b32 s16, v18, 0 +; GFX11-NEXT: s_and_b32 s16, s80, 0xff ; GFX11-NEXT: s_or_b32 s1, s1, s2 -; GFX11-NEXT: v_readlane_b32 s2, v18, 3 +; GFX11-NEXT: s_and_b32 s2, s83, 0xff ; GFX11-NEXT: s_and_b32 s1, s1, 0xffff -; GFX11-NEXT: v_readlane_b32 s17, v19, 29 -; GFX11-NEXT: s_and_b32 s16, s16, 0xff -; GFX11-NEXT: v_readlane_b32 s100, v17, 4 -; GFX11-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-NEXT: v_readlane_b32 s99, v17, 3 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_and_b32 s3, s18, 0xff ; GFX11-NEXT: s_lshl_b32 s2, s2, 16 -; GFX11-NEXT: s_lshl_b32 s17, s17, 8 +; GFX11-NEXT: s_lshl_b32 s17, s69, 8 ; GFX11-NEXT: s_or_b32 s1, s1, s2 -; GFX11-NEXT: v_readlane_b32 s2, v18, 1 +; GFX11-NEXT: s_lshl_b32 s2, s81, 8 ; GFX11-NEXT: v_dual_mov_b32 v5, s0 :: 
v_dual_mov_b32 v6, s1 -; GFX11-NEXT: v_readlane_b32 s0, v19, 28 -; GFX11-NEXT: s_and_b32 s1, s20, 0xff -; GFX11-NEXT: s_lshl_b32 s2, s2, 8 -; GFX11-NEXT: v_readlane_b32 s18, v19, 19 ; GFX11-NEXT: s_or_b32 s2, s3, s2 ; GFX11-NEXT: s_lshl_b32 s3, s90, 8 ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: s_or_b32 s3, s16, s3 -; GFX11-NEXT: v_readlane_b32 s16, v19, 31 +; GFX11-NEXT: s_lshl_b32 s16, s71, 8 ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: s_lshl_b32 s0, s0, 8 +; GFX11-NEXT: s_lshl_b32 s0, s68, 8 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_and_b32 s3, s19, 0xff -; GFX11-NEXT: s_lshl_b32 s16, s16, 8 -; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_and_b32 s1, s20, 0xff ; GFX11-NEXT: s_or_b32 s3, s3, s16 -; GFX11-NEXT: v_readlane_b32 s16, v19, 30 +; GFX11-NEXT: s_and_b32 s16, s70, 0xff ; GFX11-NEXT: s_and_b32 s3, s3, 0xffff -; GFX11-NEXT: s_lshl_b32 s1, s78, 8 -; GFX11-NEXT: s_and_b32 s0, s0, 0xffff -; GFX11-NEXT: s_lshl_b32 s18, s18, 8 -; GFX11-NEXT: s_and_b32 s16, s16, 0xff -; GFX11-NEXT: s_lshl_b32 s19, s86, 8 ; GFX11-NEXT: s_or_b32 s16, s16, s17 -; GFX11-NEXT: v_readlane_b32 s17, v19, 21 +; GFX11-NEXT: s_or_b32 s0, s1, s0 ; GFX11-NEXT: s_lshl_b32 s16, s16, 16 -; GFX11-NEXT: v_readlane_b32 s98, v17, 2 +; GFX11-NEXT: s_lshl_b32 s1, s78, 8 ; GFX11-NEXT: s_or_b32 s3, s3, s16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 -; GFX11-NEXT: v_readlane_b32 s2, v19, 27 -; GFX11-NEXT: v_readlane_b32 s3, v19, 24 -; GFX11-NEXT: v_readlane_b32 s16, v19, 22 -; GFX11-NEXT: s_lshl_b32 s17, s17, 8 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off -; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_and_b32 s2, s67, 0xff +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff ; GFX11-NEXT: s_or_b32 s1, s2, s1 -; GFX11-NEXT: v_readlane_b32 s2, v19, 26 +; GFX11-NEXT: s_lshl_b32 s2, s66, 8 ; 
GFX11-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s64, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_and_b32 s1, s21, 0xff -; GFX11-NEXT: s_lshl_b32 s2, s2, 8 -; GFX11-NEXT: v_readlane_b32 s86, v16, 30 +; GFX11-NEXT: s_and_b32 s16, s54, 0xff ; GFX11-NEXT: s_or_b32 s1, s1, s2 -; GFX11-NEXT: v_readlane_b32 s2, v19, 25 +; GFX11-NEXT: s_and_b32 s2, s65, 0xff ; GFX11-NEXT: s_and_b32 s1, s1, 0xffff -; GFX11-NEXT: v_readlane_b32 s31, v16, 1 -; GFX11-NEXT: v_readlane_b32 s30, v16, 0 -; GFX11-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_and_b32 s3, s22, 0xff ; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: s_lshl_b32 s17, s53, 8 ; GFX11-NEXT: s_or_b32 s1, s1, s2 -; GFX11-NEXT: v_readlane_b32 s2, v19, 23 -; GFX11-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 -; GFX11-NEXT: v_readlane_b32 s1, v19, 18 -; GFX11-NEXT: s_and_b32 s0, s24, 0xff -; GFX11-NEXT: s_lshl_b32 s2, s2, 8 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s2, s55, 8 +; GFX11-NEXT: s_lshl_b32 s18, s51, 8 ; GFX11-NEXT: s_or_b32 s2, s3, s2 -; GFX11-NEXT: s_lshl_b32 s3, s62, 8 +; GFX11-NEXT: s_lshl_b32 s3, s74, 8 ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: s_or_b32 s3, s16, s3 ; GFX11-NEXT: s_and_b32 s16, s23, 0xff ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 ; GFX11-NEXT: s_or_b32 s16, s16, s17 -; GFX11-NEXT: v_readlane_b32 s17, v19, 20 +; GFX11-NEXT: s_and_b32 s17, s52, 0xff ; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_and_b32 s3, s16, 0xffff -; GFX11-NEXT: s_lshl_b32 s1, s1, 8 -; GFX11-NEXT: s_and_b32 s17, s17, 0xff -; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_or_b32 s17, s17, s18 -; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_and_b32 s3, s16, 0xffff ; GFX11-NEXT: s_lshl_b32 s16, s17, 16 -; GFX11-NEXT: s_lshl_b32 s17, s97, 8 +; GFX11-NEXT: v_dual_mov_b32 v9, s0 :: 
v_dual_mov_b32 v10, s1 ; GFX11-NEXT: s_or_b32 s3, s3, s16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 -; GFX11-NEXT: v_readlane_b32 s2, v19, 17 +; GFX11-NEXT: s_and_b32 s0, s24, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s50, 8 +; GFX11-NEXT: s_and_b32 s2, s49, 0xff ; GFX11-NEXT: s_lshl_b32 s3, s88, 8 -; GFX11-NEXT: s_and_b32 s16, s69, 0xff -; GFX11-NEXT: s_and_b32 s18, s72, 0xff -; GFX11-NEXT: v_readlane_b32 s97, v17, 1 -; GFX11-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-NEXT: v_readlane_b32 s69, v16, 21 +; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_or_b32 s1, s2, s3 -; GFX11-NEXT: v_readlane_b32 s3, v19, 16 ; GFX11-NEXT: s_and_b32 s2, s25, 0xff -; GFX11-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_lshl_b32 s3, s48, 8 +; GFX11-NEXT: s_and_b32 s16, s39, 0xff +; GFX11-NEXT: s_lshl_b32 s17, s38, 8 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s16, s17 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: s_and_b32 s16, s73, 0xff +; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_or_b32 s1, s2, s3 ; GFX11-NEXT: s_and_b32 s2, s26, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s96, 8 +; GFX11-NEXT: s_lshl_b32 s3, s37, 8 +; GFX11-NEXT: s_and_b32 s16, s36, 0xff ; GFX11-NEXT: s_lshl_b32 s17, s76, 8 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s16, s17 ; GFX11-NEXT: s_and_b32 s16, s27, 0xff -; GFX11-NEXT: s_lshl_b32 s17, s87, 8 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s17, s35, 8 +; GFX11-NEXT: s_and_b32 s18, s34, 0xff +; GFX11-NEXT: s_lshl_b32 s19, vcc_hi, 8 ; GFX11-NEXT: s_or_b32 s16, s16, s17 ; GFX11-NEXT: s_or_b32 s17, s18, s19 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: s_lshl_b32 s3, 
s3, 16 ; GFX11-NEXT: s_and_b32 s16, s16, 0xffff ; GFX11-NEXT: s_lshl_b32 s17, s17, 16 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s16, s17 -; GFX11-NEXT: v_readlane_b32 s16, v19, 0 +; GFX11-NEXT: v_readlane_b32 s16, v18, 8 +; GFX11-NEXT: v_readlane_b32 s17, v18, 7 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 ; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 ; GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 ; GFX11-NEXT: s_and_b32 s0, s28, 0xff -; GFX11-NEXT: s_lshl_b32 s1, s85, 8 -; GFX11-NEXT: s_and_b32 s2, s84, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s16, 8 -; GFX11-NEXT: v_readlane_b32 s17, v19, 1 +; GFX11-NEXT: s_lshl_b32 s1, s104, 8 +; GFX11-NEXT: s_and_b32 s2, s103, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s72, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_or_b32 s1, s2, s3 ; GFX11-NEXT: s_and_b32 s2, s29, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s83, 8 -; GFX11-NEXT: s_and_b32 s16, s82, 0xff -; GFX11-NEXT: s_lshl_b32 s17, s81, 8 -; GFX11-NEXT: v_readlane_b32 s18, v19, 2 +; GFX11-NEXT: s_lshl_b32 s3, s102, 8 +; GFX11-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s17, s17, 8 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s16, s17 ; GFX11-NEXT: s_and_b32 s0, s0, 0xffff @@ -82828,147 +82676,176 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_or_b32 s1, s2, s3 +; GFX11-NEXT: v_readlane_b32 s3, v18, 6 +; GFX11-NEXT: v_readlane_b32 s16, v18, 5 ; GFX11-NEXT: s_and_b32 s2, s40, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s61, 8 -; GFX11-NEXT: s_and_b32 s16, s80, 0xff -; GFX11-NEXT: s_lshl_b32 s17, s18, 8 -; GFX11-NEXT: v_readlane_b32 s19, v19, 3 +; GFX11-NEXT: s_lshl_b32 s17, s62, 8 +; GFX11-NEXT: v_readlane_b32 s18, v18, 3 +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_and_b32 s16, s16, 0xff ; GFX11-NEXT: 
s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s16, s17 +; GFX11-NEXT: v_readlane_b32 s17, v18, 4 +; GFX11-NEXT: v_readlane_b32 s19, v18, 2 ; GFX11-NEXT: s_and_b32 s16, s41, 0xff -; GFX11-NEXT: s_lshl_b32 s17, s60, 8 -; GFX11-NEXT: s_and_b32 s18, s71, 0xff -; GFX11-NEXT: s_lshl_b32 s19, s70, 8 -; GFX11-NEXT: s_or_b32 s16, s16, s17 -; GFX11-NEXT: s_or_b32 s17, s18, s19 +; GFX11-NEXT: s_and_b32 s18, s18, 0xff ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s17, s17, 8 +; GFX11-NEXT: s_lshl_b32 s19, s19, 8 ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s16, s16, s17 +; GFX11-NEXT: s_or_b32 s17, s18, s19 ; GFX11-NEXT: s_and_b32 s16, s16, 0xffff ; GFX11-NEXT: s_lshl_b32 s17, s17, 16 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s16, s17 -; GFX11-NEXT: v_readlane_b32 s16, v19, 4 ; GFX11-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 ; GFX11-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; GFX11-NEXT: v_readlane_b32 s1, v18, 1 +; GFX11-NEXT: v_readlane_b32 s2, v18, 0 ; GFX11-NEXT: s_and_b32 s0, s14, 0xff -; GFX11-NEXT: s_lshl_b32 s1, s58, 8 -; GFX11-NEXT: s_and_b32 s2, s59, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s16, 8 +; GFX11-NEXT: s_lshl_b32 s3, s60, 8 +; GFX11-NEXT: v_readlane_b32 s14, v19, 30 +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_or_b32 s1, s2, s3 ; GFX11-NEXT: s_and_b32 s2, s15, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s68, 8 -; GFX11-NEXT: s_and_b32 s14, s67, 0xff -; GFX11-NEXT: s_lshl_b32 s15, s66, 8 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s14, s15 -; GFX11-NEXT: v_readlane_b32 s14, v19, 6 +; GFX11-NEXT: v_readlane_b32 s3, v19, 31 +; GFX11-NEXT: v_readlane_b32 s15, v19, 29 +; GFX11-NEXT: s_and_b32 s14, s14, 0xff ; GFX11-NEXT: s_and_b32 s0, s0, 0xffff ; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_lshl_b32 s15, s15, 8 +; GFX11-NEXT: s_or_b32 s2, s2, s3 +; GFX11-NEXT: 
s_or_b32 s3, s14, s15 ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_or_b32 s1, s2, s3 ; GFX11-NEXT: s_and_b32 s2, s12, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s65, 8 -; GFX11-NEXT: s_and_b32 s12, s64, 0xff -; GFX11-NEXT: s_lshl_b32 s14, s14, 8 -; GFX11-NEXT: v_readlane_b32 s15, v19, 7 +; GFX11-NEXT: v_readlane_b32 s3, v19, 28 +; GFX11-NEXT: v_readlane_b32 s12, v19, 27 +; GFX11-NEXT: s_lshl_b32 s14, s58, 8 +; GFX11-NEXT: v_readlane_b32 s15, v19, 24 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:48 +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_and_b32 s12, s12, 0xff ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s12, s14 ; GFX11-NEXT: s_and_b32 s12, s13, 0xff -; GFX11-NEXT: s_lshl_b32 s13, s55, 8 -; GFX11-NEXT: s_and_b32 s14, s54, 0xff -; GFX11-NEXT: s_lshl_b32 s15, s53, 8 -; GFX11-NEXT: s_or_b32 s12, s12, s13 -; GFX11-NEXT: s_or_b32 s13, s14, s15 +; GFX11-NEXT: v_readlane_b32 s13, v19, 26 +; GFX11-NEXT: v_readlane_b32 s14, v19, 25 +; GFX11-NEXT: s_lshl_b32 s15, s15, 8 ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_lshl_b32 s13, s13, 8 +; GFX11-NEXT: s_and_b32 s14, s14, 0xff +; GFX11-NEXT: s_or_b32 s12, s12, s13 +; GFX11-NEXT: s_or_b32 s13, s14, s15 ; GFX11-NEXT: s_and_b32 s12, s12, 0xffff ; GFX11-NEXT: s_lshl_b32 s13, s13, 16 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s12, s13 -; GFX11-NEXT: v_readlane_b32 s12, v19, 8 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:48 ; GFX11-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 ; GFX11-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 +; GFX11-NEXT: v_readlane_b32 s1, v19, 23 +; GFX11-NEXT: v_readlane_b32 s2, v19, 22 ; GFX11-NEXT: s_and_b32 s0, s10, 0xff -; 
GFX11-NEXT: s_lshl_b32 s1, s52, 8 -; GFX11-NEXT: s_and_b32 s2, s51, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s12, 8 +; GFX11-NEXT: s_lshl_b32 s3, s56, 8 +; GFX11-NEXT: v_readlane_b32 s10, v19, 20 +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_or_b32 s1, s2, s3 ; GFX11-NEXT: s_and_b32 s2, s11, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s50, 8 -; GFX11-NEXT: s_and_b32 s10, s49, 0xff -; GFX11-NEXT: s_lshl_b32 s11, s48, 8 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s10, s11 -; GFX11-NEXT: v_readlane_b32 s10, v19, 10 +; GFX11-NEXT: v_readlane_b32 s3, v19, 21 +; GFX11-NEXT: v_readlane_b32 s11, v19, 19 +; GFX11-NEXT: s_and_b32 s10, s10, 0xff ; GFX11-NEXT: s_and_b32 s0, s0, 0xffff ; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_lshl_b32 s11, s11, 8 +; GFX11-NEXT: s_or_b32 s2, s2, s3 +; GFX11-NEXT: s_or_b32 s3, s10, s11 ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_or_b32 s1, s2, s3 ; GFX11-NEXT: s_and_b32 s2, s8, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s39, 8 -; GFX11-NEXT: s_and_b32 s8, s38, 0xff -; GFX11-NEXT: s_lshl_b32 s10, s10, 8 -; GFX11-NEXT: v_readlane_b32 s11, v19, 11 +; GFX11-NEXT: v_readlane_b32 s3, v19, 18 +; GFX11-NEXT: v_readlane_b32 s8, v19, 17 +; GFX11-NEXT: s_lshl_b32 s10, s46, 8 +; GFX11-NEXT: v_readlane_b32 s11, v19, 14 +; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_and_b32 s8, s8, 0xff ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s8, s10 ; GFX11-NEXT: s_and_b32 s8, s9, 0xff -; GFX11-NEXT: s_lshl_b32 s9, s37, 8 -; GFX11-NEXT: s_and_b32 s10, s36, 0xff -; GFX11-NEXT: s_lshl_b32 s11, s35, 8 -; GFX11-NEXT: s_or_b32 s8, s8, s9 -; GFX11-NEXT: s_or_b32 s9, s10, s11 +; GFX11-NEXT: v_readlane_b32 s9, v19, 16 +; GFX11-NEXT: v_readlane_b32 s10, v19, 15 +; GFX11-NEXT: s_lshl_b32 s11, s11, 8 
; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_lshl_b32 s9, s9, 8 +; GFX11-NEXT: s_and_b32 s10, s10, 0xff +; GFX11-NEXT: s_or_b32 s8, s8, s9 +; GFX11-NEXT: s_or_b32 s9, s10, s11 ; GFX11-NEXT: s_and_b32 s8, s8, 0xffff ; GFX11-NEXT: s_lshl_b32 s9, s9, 16 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s8, s9 -; GFX11-NEXT: v_readlane_b32 s8, v19, 12 -; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-NEXT: v_readlane_b32 s1, v19, 13 +; GFX11-NEXT: v_readlane_b32 s2, v19, 12 +; GFX11-NEXT: v_readlane_b32 s8, v19, 0 ; GFX11-NEXT: s_and_b32 s0, s6, 0xff -; GFX11-NEXT: s_lshl_b32 s1, s56, 8 -; GFX11-NEXT: s_and_b32 s2, s57, 0xff +; GFX11-NEXT: v_readlane_b32 s6, v19, 10 +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff ; GFX11-NEXT: s_lshl_b32 s3, s8, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_or_b32 s1, s2, s3 ; GFX11-NEXT: s_and_b32 s2, s7, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s34, 8 -; GFX11-NEXT: s_and_b32 s6, vcc_hi, 0xff -; GFX11-NEXT: s_lshl_b32 s7, s46, 8 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s6, s7 -; GFX11-NEXT: v_readlane_b32 s6, v19, 14 +; GFX11-NEXT: v_readlane_b32 s3, v19, 11 +; GFX11-NEXT: v_readlane_b32 s7, v19, 9 +; GFX11-NEXT: s_and_b32 s6, s6, 0xff ; GFX11-NEXT: s_and_b32 s0, s0, 0xffff ; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-NEXT: s_or_b32 s2, s2, s3 +; GFX11-NEXT: s_or_b32 s3, s6, s7 ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_or_b32 s1, s2, s3 ; GFX11-NEXT: s_and_b32 s2, s4, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s47, 8 -; GFX11-NEXT: s_and_b32 s4, s104, 0xff +; GFX11-NEXT: v_readlane_b32 s3, v19, 8 +; GFX11-NEXT: v_readlane_b32 s4, v19, 7 +; GFX11-NEXT: 
v_readlane_b32 s6, v19, 2 +; GFX11-NEXT: v_readlane_b32 s7, v19, 3 +; GFX11-NEXT: v_readlane_b32 s7, v19, 4 +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_and_b32 s4, s4, 0xff ; GFX11-NEXT: s_lshl_b32 s6, s6, 8 -; GFX11-NEXT: v_readlane_b32 s7, v19, 15 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s4, s6 ; GFX11-NEXT: s_and_b32 s4, s5, 0xff -; GFX11-NEXT: s_lshl_b32 s5, s103, 8 -; GFX11-NEXT: s_and_b32 s6, s102, 0xff -; GFX11-NEXT: s_lshl_b32 s7, s101, 8 -; GFX11-NEXT: s_or_b32 s4, s4, s5 -; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: v_readlane_b32 s5, v19, 6 +; GFX11-NEXT: v_readlane_b32 s6, v19, 5 +; GFX11-NEXT: s_lshl_b32 s7, s7, 8 ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s6, s7 ; GFX11-NEXT: s_and_b32 s4, s4, 0xffff ; GFX11-NEXT: s_lshl_b32 s5, s5, 16 ; GFX11-NEXT: s_or_b32 s2, s2, s3 @@ -82976,9 +82853,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:64 ; GFX11-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 ; GFX11-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 -; GFX11-NEXT: v_readlane_b32 s17, v19, 5 -; GFX11-NEXT: v_readlane_b32 s13, v19, 9 -; GFX11-NEXT: v_readlane_b32 s9, v19, 13 +; GFX11-NEXT: v_readlane_b32 s9, v19, 1 ; GFX11-NEXT: s_clause 0x2 ; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:80 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:96 @@ -82987,8 +82862,13 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: v_readlane_b32 s103, v17, 7 ; GFX11-NEXT: v_readlane_b32 s102, v17, 6 ; GFX11-NEXT: v_readlane_b32 s101, v17, 5 +; GFX11-NEXT: v_readlane_b32 s100, v17, 4 +; GFX11-NEXT: v_readlane_b32 s99, v17, 3 +; GFX11-NEXT: v_readlane_b32 s98, v17, 2 +; GFX11-NEXT: v_readlane_b32 s97, v17, 1 ; 
GFX11-NEXT: v_readlane_b32 s96, v17, 0 ; GFX11-NEXT: v_readlane_b32 s87, v16, 31 +; GFX11-NEXT: v_readlane_b32 s86, v16, 30 ; GFX11-NEXT: v_readlane_b32 s85, v16, 29 ; GFX11-NEXT: v_readlane_b32 s84, v16, 28 ; GFX11-NEXT: v_readlane_b32 s83, v16, 27 @@ -82997,6 +82877,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: v_readlane_b32 s80, v16, 24 ; GFX11-NEXT: v_readlane_b32 s71, v16, 23 ; GFX11-NEXT: v_readlane_b32 s70, v16, 22 +; GFX11-NEXT: v_readlane_b32 s69, v16, 21 ; GFX11-NEXT: v_readlane_b32 s68, v16, 20 ; GFX11-NEXT: v_readlane_b32 s67, v16, 19 ; GFX11-NEXT: v_readlane_b32 s66, v16, 18 @@ -83016,6 +82897,8 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: v_readlane_b32 s36, v16, 4 ; GFX11-NEXT: v_readlane_b32 s35, v16, 3 ; GFX11-NEXT: v_readlane_b32 s34, v16, 2 +; GFX11-NEXT: v_readlane_b32 s31, v16, 1 +; GFX11-NEXT: v_readlane_b32 s30, v16, 0 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_load_b32 v16, off, s32 @@ -83025,6 +82908,145 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB57_4: +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $vcc_hi +; GFX11-NEXT: ; implicit-def: $vcc_lo +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 0 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr101 +; GFX11-NEXT: ; implicit-def: $sgpr45 +; GFX11-NEXT: ; implicit-def: $sgpr30 +; GFX11-NEXT: ; implicit-def: $sgpr100 +; GFX11-NEXT: ; implicit-def: $sgpr99 +; GFX11-NEXT: ; implicit-def: $sgpr98 +; GFX11-NEXT: ; implicit-def: $sgpr43 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; 
implicit-def: $sgpr94 +; GFX11-NEXT: ; implicit-def: $sgpr97 +; GFX11-NEXT: ; implicit-def: $sgpr96 +; GFX11-NEXT: ; implicit-def: $sgpr87 +; GFX11-NEXT: ; implicit-def: $sgpr86 +; GFX11-NEXT: ; implicit-def: $sgpr85 +; GFX11-NEXT: ; implicit-def: $sgpr92 +; GFX11-NEXT: ; implicit-def: $sgpr84 +; GFX11-NEXT: ; implicit-def: $sgpr83 +; GFX11-NEXT: ; implicit-def: $sgpr82 +; GFX11-NEXT: ; implicit-def: $sgpr81 +; GFX11-NEXT: ; implicit-def: $sgpr80 +; GFX11-NEXT: ; implicit-def: $sgpr90 +; GFX11-NEXT: ; implicit-def: $sgpr71 +; GFX11-NEXT: ; implicit-def: $sgpr70 +; GFX11-NEXT: ; implicit-def: $sgpr69 +; GFX11-NEXT: ; implicit-def: $sgpr68 +; GFX11-NEXT: ; implicit-def: $sgpr67 +; GFX11-NEXT: ; implicit-def: $sgpr78 +; GFX11-NEXT: ; implicit-def: $sgpr66 +; GFX11-NEXT: ; implicit-def: $sgpr65 +; GFX11-NEXT: ; implicit-def: $sgpr64 +; GFX11-NEXT: ; implicit-def: $sgpr55 +; GFX11-NEXT: ; implicit-def: $sgpr54 +; GFX11-NEXT: ; implicit-def: $sgpr74 +; GFX11-NEXT: ; implicit-def: $sgpr53 +; GFX11-NEXT: ; implicit-def: $sgpr52 +; GFX11-NEXT: ; implicit-def: $sgpr51 +; GFX11-NEXT: ; implicit-def: $sgpr50 +; GFX11-NEXT: ; implicit-def: $sgpr49 +; GFX11-NEXT: ; implicit-def: $sgpr48 +; GFX11-NEXT: ; implicit-def: $sgpr39 +; GFX11-NEXT: ; implicit-def: $sgpr38 +; GFX11-NEXT: ; implicit-def: $sgpr37 +; GFX11-NEXT: ; implicit-def: $sgpr36 +; GFX11-NEXT: ; implicit-def: $sgpr35 +; GFX11-NEXT: ; implicit-def: $sgpr34 +; GFX11-NEXT: ; implicit-def: $sgpr104 +; GFX11-NEXT: ; implicit-def: $sgpr103 +; GFX11-NEXT: ; implicit-def: $sgpr102 +; GFX11-NEXT: ; implicit-def: $sgpr88 +; GFX11-NEXT: ; implicit-def: $sgpr76 +; GFX11-NEXT: ; implicit-def: $sgpr72 +; GFX11-NEXT: ; implicit-def: $sgpr62 +; GFX11-NEXT: ; implicit-def: $sgpr60 +; GFX11-NEXT: ; implicit-def: $sgpr58 +; GFX11-NEXT: ; implicit-def: $sgpr56 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; 
implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: v_writelane_b32 v19, vcc_hi, 1 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $vcc_lo +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: v_writelane_b32 v19, vcc_lo, 2 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: v_writelane_b32 v19, vcc_hi, 3 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed 
$sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; kill: killed $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: s_branch .LBB57_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -90348,7 +90370,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 ; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 ; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:8 @@ -90374,7 +90396,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 ; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v15 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:548 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 @@ -90383,11 +90405,11 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v5 -; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v9 -; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v12, 8, v11 ; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v13 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 @@ -90408,7 +90430,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v41 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v40 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v55 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v54 @@ -90416,22 +90438,22 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 ; 
VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v50 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v22 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 @@ -90454,23 +90476,24 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v38 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v0 +; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_lshlrev_b32_e32 v54, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v13 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v0, 
8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v49, 8, v3 -; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v9 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 @@ -90483,37 +90506,37 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 ; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v13 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 ; VI-NEXT: buffer_store_dword v0, 
off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v3 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:328 ; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:28 ; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 ; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v9 ; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:68 ; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:76 ; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84 @@ -90521,23 +90544,23 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:100 ; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:108 ; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:116 -; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:124 +; VI-NEXT: 
buffer_load_ushort v54, off, s[0:3], s32 offset:124 ; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:140 ; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:156 ; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:164 ; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:172 -; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:180 -; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:188 ; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:196 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:204 -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:212 ; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:220 ; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 ; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:252 ; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:260 ; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:268 ; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:276 @@ -90550,57 +90573,57 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 
offset:400 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], 
s32 offset:688 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:712 ; 4-byte 
Folded Spill -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; 
VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB59_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_or_b32_sdwa v0, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v4, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; 
VI-NEXT: v_or_b32_sdwa v0, v2, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -90615,12 +90638,12 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v2, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v3, v3, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -90649,9 +90672,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded 
Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v29, v9 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload @@ -90674,15 +90695,15 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v50, v0 -; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: 
v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -90692,18 +90713,18 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v59, v0 ; VI-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v56, v0 -; VI-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v39, v0 ; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -90711,7 +90732,7 @@ define inreg <16 x 
i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v38, v1 ; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v37, v0 @@ -90719,8 +90740,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v36, v0 ; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -90732,39 +90753,41 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v33, v0 -; VI-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v19, 
v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v1, v21, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v51, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v34, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v22, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v25, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v34, v22 ; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v43, v54 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v30, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v43, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v32, v23 ; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v43, v49 -; VI-NEXT: v_or_b32_sdwa v0, v30, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v32, v54 -; VI-NEXT: v_mov_b32_e32 v34, v26 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v51, v3 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v43, v0 +; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v49, v1 -; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v54, v0 @@ -90774,28 +90797,26 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v46, v61 ; VI-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v45, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v58, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v58, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v45, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v44, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v47, v45 ; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v58, v44 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v48, v0 -; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v63, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v48, v28 +; VI-NEXT: v_mov_b32_e32 v47, v58 +; VI-NEXT: v_mov_b32_e32 v45, v44 +; VI-NEXT: v_mov_b32_e32 v63, v42 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v42, 
v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v1, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v63, v42 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -90811,8 +90832,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v57, v0 ; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -90846,44 +90867,43 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_branch .LBB59_3 ; VI-NEXT: .LBB59_2: -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, 
s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v32, v54 -; VI-NEXT: v_mov_b32_e32 v43, v49 +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v46, v61 -; VI-NEXT: v_mov_b32_e32 v47, v45 -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; 
VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v34, v26 -; VI-NEXT: v_mov_b32_e32 v58, v44 +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v34, v22 +; VI-NEXT: v_mov_b32_e32 v32, v23 +; VI-NEXT: v_mov_b32_e32 v47, v58 +; VI-NEXT: v_mov_b32_e32 v45, v44 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_mov_b32_e32 v63, v42 ; VI-NEXT: v_mov_b32_e32 v51, v7 -; VI-NEXT: v_mov_b32_e32 v48, v29 +; VI-NEXT: v_mov_b32_e32 v48, v28 ; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: .LBB59_3: ; %Flow ; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; VI-NEXT: v_mov_b32_e32 v44, v47 -; VI-NEXT: v_mov_b32_e32 v47, v46 +; VI-NEXT: v_mov_b32_e32 v42, v45 +; VI-NEXT: v_mov_b32_e32 v45, v46 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_mov_b32_e32 v46, v49 ; VI-NEXT: s_cbranch_vccnz .LBB59_5 ; VI-NEXT: ; %bb.4: ; %cmp.true -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 @@ -90937,7 +90957,7 @@ define inreg <16 x i64> 
@bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 @@ -90946,8 +90966,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -90960,8 +90980,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: 
s_waitcnt vmcnt(1) @@ -91036,29 +91056,29 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded 
Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 @@ -91070,8 +91090,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 @@ -91083,8 +91103,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 
3, v0 @@ -91096,8 +91116,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 @@ -91108,8 +91128,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -91119,8 +91139,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded 
Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 @@ -91131,8 +91151,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -91142,63 +91162,63 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa 
v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, 
s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; 
VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -91208,54 +91228,57 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v47 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v45 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 
4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v44 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v47 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v58 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v42 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v63 -; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 
; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v41 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v40 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload @@ -91540,7 +91563,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:164 ; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:172 ; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:180 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:188 ; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:196 ; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:204 ; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:212 @@ -91548,11 +91571,11 @@ define inreg <16 x i64> 
@bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:228 ; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:236 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:252 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:252 ; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:260 ; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:268 -; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:276 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:276 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:284 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:292 ; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:300 ; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:308 @@ -91578,7 +91601,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill @@ -91753,7 +91776,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v44, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v40, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v37, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -91765,7 +91788,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v37, v57 ; GFX9-NEXT: v_mov_b32_e32 v57, v60 ; GFX9-NEXT: v_mov_b32_e32 v52, v56 -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_mov_b32_e32 v34, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -91774,14 +91797,14 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v45, v44 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v45, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, 
s[0:3], s32 offset:436 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -91791,12 +91814,12 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v51, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -91850,7 +91873,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; 
GFX9-NEXT: .LBB59_2: ; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload @@ -92212,12 +92235,12 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v45 -; GFX9-NEXT: v_or_b32_sdwa v1, v44, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v40 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v44 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v55 @@ -92227,7 +92250,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_add_u32_e32 v0, 3, v43 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v36 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v42 ; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -92236,7 +92259,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v42 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v36 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v41 @@ -98984,10 +99007,10 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_mul_f32_e32 v34, 1.0, v36 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v33, 1.0, s18 ; SI-NEXT: v_mul_f32_e64 v36, 1.0, s21 ; SI-NEXT: v_mul_f32_e64 v42, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v33, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s22 ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill @@ -99016,9 +99039,9 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v0, 
off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB63_4 @@ -99029,12 +99052,13 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: v_alignbit_b32 v5, v5, v8, 16 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v35, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v3, v3, v35, 16 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_alignbit_b32 v4, v4, v9, 16 ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_mov_b32_e32 v59, v2 ; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 @@ -99044,10 +99068,11 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_mov_b32_e32 v47, v10 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_mov_b32_e32 v45, v12 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v3, v3, v33, 16 -; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_mov_b32_e32 v44, v14 
+; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_alignbit_b32 v1, v1, v33, 16 ; SI-NEXT: v_mov_b32_e32 v62, v38 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 @@ -99087,20 +99112,20 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_alignbit_b32 v28, v28, v37, 16 ; SI-NEXT: v_mov_b32_e32 v37, v34 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(2) ; SI-NEXT: v_mov_b32_e32 v35, v7 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_mov_b32_e32 v43, v8 ; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) -; SI-NEXT: v_mov_b32_e32 v42, v9 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mov_b32_e32 v60, v9 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v32 ; SI-NEXT: v_alignbit_b32 v31, v31, v34, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v60, v8 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_mov_b32_e32 v42, v8 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v11 @@ -99124,7 +99149,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_alignbit_b32 v12, v12, v14, 16 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v44, v14 +; SI-NEXT: v_mov_b32_e32 v33, v14 ; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) @@ -99147,7 +99172,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; 
SI-NEXT: .LBB63_2: ; %cmp.true ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 @@ -99163,7 +99188,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v57 @@ -99175,7 +99200,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v45 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v36 @@ -99287,7 +99312,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 @@ -99310,7 
+99335,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v43 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v60 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v58 @@ -99325,7 +99350,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v63 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v62 @@ -99411,16 +99436,16 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB63_4: ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 
offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v61, v53 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload @@ -99429,7 +99454,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_mov_b32_e32 v57, v11 ; SI-NEXT: v_mov_b32_e32 v47, v10 ; SI-NEXT: v_mov_b32_e32 v45, v12 -; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_mov_b32_e32 v44, v14 ; SI-NEXT: v_mov_b32_e32 v62, v38 ; SI-NEXT: v_mov_b32_e32 v38, v39 ; SI-NEXT: v_mov_b32_e32 v39, v41 @@ -101630,36 +101655,98 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: 
; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; 
implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: ; implicit-def: $vgpr31 @@ -101667,175 +101754,104 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: 
$vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB64_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v31 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v40, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, 
v27 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v63 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v32 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v28 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; 
SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v32 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v32 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v32 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; 
SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v56, v3 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v32 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v58, v2 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v32, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v63 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v60, v1 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v52, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v29 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; 
SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v31 @@ -101848,18 +101864,8 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 @@ -101867,28 +101873,33 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v31 -; 
SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v62 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 
offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v53, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v1 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -101905,7 +101916,22 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr62 
+; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: .LBB64_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB64_4 @@ -101915,9 +101941,9 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 ; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc @@ -101928,114 +101954,111 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 ; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 -; SI-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v26 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v50 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc +; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, 
v19 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v48 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 -; SI-NEXT: v_mov_b32_e32 v38, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 ; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 ; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v17 ; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 -; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc -; SI-NEXT: 
s_waitcnt vmcnt(7) -; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v63 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v14 -; SI-NEXT: v_addc_u32_e32 v44, vcc, 0, v62, vcc -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v42 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v44 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: 
v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v28 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v41 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 +; SI-NEXT: v_mov_b32_e32 v55, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v3 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v43 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v63 
+; SI-NEXT: v_addc_u32_e32 v42, vcc, 0, v62, vcc +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v53 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v53, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v62 ; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 ; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13 ; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v29 ; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v42, v6 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 @@ -102053,51 +102076,59 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: 
v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 +; SI-NEXT: v_mov_b32_e32 v36, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v38, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v34, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v45 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v59, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v61, v6 -; SI-NEXT: v_mov_b32_e32 v50, v29 -; SI-NEXT: v_mov_b32_e32 v48, v30 -; SI-NEXT: v_mov_b32_e32 v46, v28 -; SI-NEXT: v_mov_b32_e32 v34, v8 -; SI-NEXT: v_mov_b32_e32 v32, v7 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded 
Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v52, v30 +; SI-NEXT: v_mov_b32_e32 v44, v29 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword 
v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v41, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v43, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 ; SI-NEXT: .LBB64_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] @@ -102123,63 +102154,59 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 
offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded 
Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 ; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 ; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 @@ -102188,7 +102215,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 ; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 @@ -102197,7 +102224,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 @@ -102206,7 +102233,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 
4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 @@ -102215,7 +102242,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 @@ -102225,9 +102252,20 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -102238,7 +102276,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded 
Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -102249,7 +102287,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -102260,7 +102298,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -102271,7 +102309,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -102282,7 +102320,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 
0x54, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -102291,9 +102329,9 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -102303,8 +102341,8 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -102315,7 +102353,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -102326,7 +102364,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -102337,7 +102375,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -102347,8 +102385,8 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -102356,35 +102394,28 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 ; SI-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -107636,24 +107667,24 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:48 ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 
SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:68 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v11 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v11 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v21 @@ -107674,23 +107705,23 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 ; SI-NEXT: s_waitcnt vmcnt(5) expcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v36 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v32 -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v32 +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: 
buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill @@ -107703,45 +107734,46 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB71_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt expcnt(2) +; 
SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v7, v0, v61 +; SI-NEXT: v_or_b32_e32 v7, v0, v58 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v9, v0, v50 +; SI-NEXT: v_or_b32_e32 v9, v0, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v10, v0, v43 +; SI-NEXT: v_or_b32_e32 v10, v0, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 -; SI-NEXT: v_or_b32_e32 v11, v0, v41 +; SI-NEXT: v_or_b32_e32 v11, v0, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 -; SI-NEXT: v_or_b32_e32 v12, v0, v40 +; SI-NEXT: v_or_b32_e32 v12, v0, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_mov_b32_e32 v52, v57 -; SI-NEXT: v_mov_b32_e32 v57, v40 -; SI-NEXT: v_mov_b32_e32 v40, v49 -; SI-NEXT: v_mov_b32_e32 v49, v13 +; SI-NEXT: v_mov_b32_e32 v36, v41 +; SI-NEXT: v_mov_b32_e32 v41, v13 ; SI-NEXT: v_or_b32_e32 v13, v0, v13 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 -; SI-NEXT: v_mov_b32_e32 v36, v41 -; SI-NEXT: v_mov_b32_e32 v41, v14 -; SI-NEXT: v_or_b32_e32 v14, v0, v48 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 ; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v50, v43 -; SI-NEXT: v_mov_b32_e32 v43, v48 -; SI-NEXT: v_mov_b32_e32 v48, v15 +; SI-NEXT: v_mov_b32_e32 v50, v45 +; SI-NEXT: v_mov_b32_e32 v45, v14 +; SI-NEXT: v_or_b32_e32 v14, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: v_mov_b32_e32 v52, v57 +; SI-NEXT: v_mov_b32_e32 v57, v49 +; SI-NEXT: v_mov_b32_e32 v49, v40 +; SI-NEXT: v_mov_b32_e32 v40, v15 ; SI-NEXT: v_or_b32_e32 v15, v0, v15 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 -; SI-NEXT: v_mov_b32_e32 v38, v61 +; SI-NEXT: v_mov_b32_e32 v34, v58 +; SI-NEXT: v_mov_b32_e32 v58, v61 ; SI-NEXT: v_mov_b32_e32 v61, v56 ; SI-NEXT: v_mov_b32_e32 v56, v16 ; SI-NEXT: v_or_b32_e32 v16, v0, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_or_b32_e32 v17, v0, v17 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 ; SI-NEXT: s_waitcnt expcnt(0) @@ -107775,7 
+107807,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 ; SI-NEXT: v_or_b32_e32 v26, v0, v26 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 @@ -107786,7 +107818,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_lshl_b32 s9, s25, 16 ; SI-NEXT: v_mov_b32_e32 v33, v28 ; SI-NEXT: v_or_b32_e32 v28, v0, v5 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 @@ -107798,7 +107830,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_lshl_b32 s11, s29, 16 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v30, v0, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 ; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: v_mov_b32_e32 v63, v2 ; SI-NEXT: v_mov_b32_e32 v32, v55 @@ -107806,9 +107838,9 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v55, v4 ; SI-NEXT: v_mov_b32_e32 v53, v6 ; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_mov_b32_e32 v59, v42 -; SI-NEXT: v_or_b32_e32 v31, v0, v34 +; SI-NEXT: v_mov_b32_e32 v59, v44 +; SI-NEXT: v_mov_b32_e32 v43, v42 +; SI-NEXT: v_or_b32_e32 v31, v0, v48 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -107818,12 +107850,13 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_cbranch_execnz .LBB71_3 ; SI-NEXT: .LBB71_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v32, v1 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v38, v43 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 @@ -107867,42 +107900,42 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; 
SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -107964,7 +107997,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -107980,12 +108013,12 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, 
off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 @@ -107994,7 +108027,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 @@ -108029,26 +108062,26 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB71_4: -; SI-NEXT: v_mov_b32_e32 v38, v61 +; SI-NEXT: v_mov_b32_e32 v34, v58 ; SI-NEXT: v_mov_b32_e32 v32, v55 +; SI-NEXT: v_mov_b32_e32 v58, v61 ; SI-NEXT: v_mov_b32_e32 v63, v2 ; SI-NEXT: v_mov_b32_e32 v55, v4 ; SI-NEXT: v_mov_b32_e32 v53, v6 ; SI-NEXT: v_mov_b32_e32 v52, v57 ; SI-NEXT: v_mov_b32_e32 v51, v50 ; SI-NEXT: v_mov_b32_e32 v61, v56 -; SI-NEXT: v_mov_b32_e32 v50, v43 +; SI-NEXT: v_mov_b32_e32 v50, v45 ; SI-NEXT: v_mov_b32_e32 v36, v41 -; SI-NEXT: v_mov_b32_e32 v57, v40 -; SI-NEXT: v_mov_b32_e32 v40, v49 -; SI-NEXT: v_mov_b32_e32 v49, v13 -; SI-NEXT: v_mov_b32_e32 v43, v48 -; SI-NEXT: v_mov_b32_e32 v48, v15 -; SI-NEXT: v_mov_b32_e32 v41, v14 +; SI-NEXT: v_mov_b32_e32 v41, v13 +; SI-NEXT: v_mov_b32_e32 v57, v49 +; SI-NEXT: v_mov_b32_e32 v49, v40 +; SI-NEXT: v_mov_b32_e32 v40, v15 +; SI-NEXT: v_mov_b32_e32 v45, v14 ; SI-NEXT: v_mov_b32_e32 v56, v16 ; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_mov_b32_e32 v59, v42 +; SI-NEXT: v_mov_b32_e32 v59, v44 +; SI-NEXT: v_mov_b32_e32 v43, v42 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v37, v20 ; SI-NEXT: v_mov_b32_e32 
v39, v23 @@ -114627,7 +114660,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v29 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v11 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v29 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v1 @@ -114645,7 +114678,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v30 ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v30 ; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v30 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v32 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v32 ; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v32 @@ -114813,9 +114846,9 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v33, s71 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v33, s69 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s68 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s68 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v33, s67 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v33, s66 @@ -115137,12 +115170,12 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; 
VI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v31 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v31, v32, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v32 ; VI-NEXT: v_or_b32_sdwa v29, v29, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -125127,7 +125160,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 ; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 ; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:8 @@ -125153,7 +125186,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 ; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 
4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v15 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 @@ -125162,11 +125195,11 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v5 -; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v9 -; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v12, 8, v11 ; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v13 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 @@ -125187,7 +125220,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v41 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v40 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v55 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v54 @@ -125195,22 +125228,22 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 -; VI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v50 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v22 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 @@ -125233,23 +125266,24 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v38 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v0 +; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: 
v_lshlrev_b32_e32 v32, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_lshlrev_b32_e32 v54, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v13 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v49, 8, v3 -; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v9 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 @@ -125262,37 +125296,37 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 ; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v13 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 
4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v3 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:328 ; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:28 ; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 ; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v9 ; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:68 ; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:76 ; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84 @@ -125300,23 +125334,23 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 
offset:100 ; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:108 ; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:116 -; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:124 ; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:140 ; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:156 ; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:164 ; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:172 -; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:180 -; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:188 ; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:196 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:204 -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:212 ; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:220 ; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 ; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:252 ; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:260 ; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:268 ; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:276 @@ -125329,57 
+125363,57 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; 
VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; VI-NEXT: 
buffer_store_dword v21, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, 
off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB75_2 ; VI-NEXT: ; %bb.1: ; 
%cmp.false -; VI-NEXT: v_or_b32_sdwa v0, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v4, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v2, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -125394,12 +125428,12 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v2, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v3, v3, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -125428,9 +125462,7 @@ define inreg <16 x double> 
@bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v29, v9 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload @@ -125453,15 +125485,15 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v50, v0 -; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: 
v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -125471,18 +125503,18 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_mov_b32_e32 v59, v0 ; VI-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v56, v0 -; VI-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 
offset:608 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v39, v0 ; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -125490,7 +125522,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_mov_b32_e32 v38, v1 ; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v37, v0 @@ -125498,8 +125530,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v36, v0 ; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -125511,39 +125543,41 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v33, v0 -; VI-NEXT: 
v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v1, v21, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v51, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v34, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v22, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v25, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 
v34, v22 ; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v43, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v30, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v43, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v32, v23 ; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v43, v49 -; VI-NEXT: v_or_b32_sdwa v0, v30, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v32, v54 -; VI-NEXT: v_mov_b32_e32 v34, v26 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v51, v3 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v43, v0 +; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v49, v1 -; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; VI-NEXT: 
s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v54, v0 @@ -125553,28 +125587,26 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_mov_b32_e32 v46, v61 ; VI-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v45, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v58, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v58, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v45, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v44, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v47, v45 ; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v58, v44 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v48, v0 -; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v63, v28 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v48, v28 +; VI-NEXT: v_mov_b32_e32 v47, v58 +; VI-NEXT: v_mov_b32_e32 v45, v44 +; VI-NEXT: v_mov_b32_e32 v63, v42 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v42, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v1, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v63, v42 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -125590,8 +125622,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v57, v0 ; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -125625,44 +125657,43 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: 
s_mov_b64 s[4:5], 0 ; VI-NEXT: s_branch .LBB75_3 ; VI-NEXT: .LBB75_2: -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v32, v54 -; VI-NEXT: v_mov_b32_e32 v43, v49 +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v46, v61 -; VI-NEXT: v_mov_b32_e32 v47, v45 -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:612 ; 4-byte 
Folded Reload -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v34, v26 -; VI-NEXT: v_mov_b32_e32 v58, v44 +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v34, v22 +; VI-NEXT: v_mov_b32_e32 v32, v23 +; VI-NEXT: v_mov_b32_e32 v47, v58 +; VI-NEXT: v_mov_b32_e32 v45, v44 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_mov_b32_e32 v63, v42 ; VI-NEXT: v_mov_b32_e32 v51, v7 -; VI-NEXT: v_mov_b32_e32 v48, v29 +; VI-NEXT: v_mov_b32_e32 v48, v28 ; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: .LBB75_3: ; %Flow ; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; VI-NEXT: v_mov_b32_e32 v44, v47 -; VI-NEXT: v_mov_b32_e32 v47, v46 +; VI-NEXT: v_mov_b32_e32 v42, v45 +; VI-NEXT: v_mov_b32_e32 v45, v46 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_mov_b32_e32 v46, v49 ; VI-NEXT: s_cbranch_vccnz .LBB75_5 ; VI-NEXT: ; %bb.4: ; %cmp.true -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], 
s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 @@ -125716,7 +125747,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 @@ -125725,8 +125756,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -125739,8 +125770,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 
offset:680 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -125815,29 +125846,29 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 @@ -125849,8 +125880,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 @@ -125862,8 +125893,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 @@ -125875,8 +125906,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 @@ -125887,8 +125918,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -125898,8 +125929,8 @@ define inreg <16 x double> 
@bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 @@ -125910,8 +125941,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -125921,63 +125952,63 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], 
s32 offset:720 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; 
VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -125987,54 +126018,57 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v47 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v45 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v44 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v47 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v58 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v42 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, 
s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v63 -; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v41 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v40 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload @@ -126319,7 +126353,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:164 ; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:172 ; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:180 -; GFX9-NEXT: buffer_load_ushort v44, off, 
s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:188 ; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:196 ; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:204 ; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:212 @@ -126327,11 +126361,11 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:228 ; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:236 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:252 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:252 ; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:260 ; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:268 -; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:276 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:276 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:284 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:292 ; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:300 ; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:308 @@ -126357,7 +126391,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 
offset:688 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill @@ -126532,7 +126566,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v44, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v40, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v37, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -126544,7 +126578,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v37, v57 ; GFX9-NEXT: v_mov_b32_e32 v57, v60 ; GFX9-NEXT: v_mov_b32_e32 v52, v56 -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_mov_b32_e32 v34, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -126553,14 +126587,14 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v45, v44 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v45, v40 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -126570,12 +126604,12 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v51, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: 
s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -126629,7 +126663,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: .LBB75_2: ; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload @@ -126991,12 +127025,12 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v45 -; GFX9-NEXT: v_or_b32_sdwa v1, v44, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v40 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v44 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v55 @@ -127006,7 +127040,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_add_u32_e32 v0, 3, v43 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v36 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v42 ; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -127015,7 +127049,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v42 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v36 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v41 @@ -129934,19 +129968,20 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: v_writelane_b32 v62, s46, 3 ; SI-NEXT: s_cbranch_execnz .LBB77_4 ; SI-NEXT: .LBB77_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[19:20], s[12:13], 1.0 +; SI-NEXT: v_add_f64 v[11:12], s[8:9], 1.0 ; SI-NEXT: v_add_f64 v[3:4], s[6:7], 1.0 -; SI-NEXT: v_add_f64 v[1:2], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[15:16], s[10:11], 1.0 ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v20 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v11 +; SI-NEXT: v_add_f64 v[1:2], s[22:23], 1.0 ; SI-NEXT: 
v_add_f64 v[41:42], s[24:25], 1.0 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v16 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v16 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v42 ; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v42 @@ -129975,8 +130010,7 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: v_add_f64 v[31:32], s[42:43], 1.0 ; SI-NEXT: v_add_f64 v[27:28], s[40:41], 1.0 ; SI-NEXT: v_add_f64 v[23:24], s[14:15], 1.0 -; SI-NEXT: v_add_f64 v[15:16], s[10:11], 1.0 -; SI-NEXT: v_add_f64 v[11:12], s[8:9], 1.0 +; SI-NEXT: v_add_f64 v[19:20], s[12:13], 1.0 ; SI-NEXT: v_add_f64 v[7:8], s[4:5], 1.0 ; SI-NEXT: v_add_f64 v[59:60], s[18:19], 1.0 ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v8 @@ -129985,27 +130019,27 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 ; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v12 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v12 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v11 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v16 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v16 -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v20 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v24 ; SI-NEXT: 
v_lshlrev_b32_e32 v7, 16, v24 ; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v28 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v28 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v28 ; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v32 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v32 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v32 ; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v36 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v36 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v36 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v36 ; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v35 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 ; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v50 @@ -130096,14 +130130,14 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: ; kill: killed $sgpr46 ; SI-NEXT: s_branch .LBB77_2 ; SI-NEXT: .LBB77_4: -; SI-NEXT: v_mov_b32_e32 v1, s71 +; SI-NEXT: v_mov_b32_e32 v1, s85 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s69 +; SI-NEXT: v_mov_b32_e32 v1, s83 ; SI-NEXT: v_readlane_b32 s4, v62, 0 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s68 +; SI-NEXT: v_mov_b32_e32 v1, s82 ; SI-NEXT: v_mov_b32_e32 v61, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 1 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -130122,27 +130156,27 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: v_mov_b32_e32 v14, s96 ; SI-NEXT: v_mov_b32_e32 v21, s87 ; SI-NEXT: v_mov_b32_e32 v18, s86 -; 
SI-NEXT: v_mov_b32_e32 v25, s85 ; SI-NEXT: v_mov_b32_e32 v22, s84 -; SI-NEXT: v_mov_b32_e32 v29, s83 -; SI-NEXT: v_mov_b32_e32 v26, s82 -; SI-NEXT: v_mov_b32_e32 v33, s81 -; SI-NEXT: v_mov_b32_e32 v30, s80 -; SI-NEXT: v_mov_b32_e32 v34, s70 +; SI-NEXT: v_mov_b32_e32 v16, s81 +; SI-NEXT: v_mov_b32_e32 v15, s80 +; SI-NEXT: v_mov_b32_e32 v12, s71 +; SI-NEXT: v_mov_b32_e32 v11, s70 +; SI-NEXT: v_mov_b32_e32 v20, s69 +; SI-NEXT: v_mov_b32_e32 v19, s68 ; SI-NEXT: v_mov_b32_e32 v8, s67 ; SI-NEXT: v_mov_b32_e32 v7, s66 ; SI-NEXT: v_mov_b32_e32 v24, s65 ; SI-NEXT: v_mov_b32_e32 v23, s64 -; SI-NEXT: v_mov_b32_e32 v16, s55 -; SI-NEXT: v_mov_b32_e32 v15, s54 +; SI-NEXT: v_mov_b32_e32 v26, s55 +; SI-NEXT: v_mov_b32_e32 v25, s54 ; SI-NEXT: v_mov_b32_e32 v28, s53 ; SI-NEXT: v_mov_b32_e32 v27, s52 -; SI-NEXT: v_mov_b32_e32 v12, s51 -; SI-NEXT: v_mov_b32_e32 v11, s50 +; SI-NEXT: v_mov_b32_e32 v30, s51 +; SI-NEXT: v_mov_b32_e32 v29, s50 ; SI-NEXT: v_mov_b32_e32 v32, s49 ; SI-NEXT: v_mov_b32_e32 v31, s48 -; SI-NEXT: v_mov_b32_e32 v20, s39 -; SI-NEXT: v_mov_b32_e32 v19, s38 +; SI-NEXT: v_mov_b32_e32 v34, s39 +; SI-NEXT: v_mov_b32_e32 v33, s38 ; SI-NEXT: v_mov_b32_e32 v36, s37 ; SI-NEXT: v_mov_b32_e32 v35, s36 ; SI-NEXT: v_mov_b32_e32 v38, s35 @@ -130275,9 +130309,9 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v33 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -130289,9 +130323,9 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, 
s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v30 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v29 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -130303,9 +130337,9 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v26 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v25 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -130324,6 +130358,27 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v19 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v11 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v15 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: 
buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_readlane_b32 s99, v63, 35 @@ -130368,36 +130423,15 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v34 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v22 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v30 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v29 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v26 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v25 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v22 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -133711,10 
+133745,10 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; SI-NEXT: v_mul_f32_e32 v34, 1.0, v36 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v33, 1.0, s18 ; SI-NEXT: v_mul_f32_e64 v36, 1.0, s21 ; SI-NEXT: v_mul_f32_e64 v42, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v33, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s22 ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill @@ -133743,9 +133777,9 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB79_4 @@ -133756,12 +133790,13 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: v_alignbit_b32 v5, 
v5, v8, 16 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v35, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v3, v3, v35, 16 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_alignbit_b32 v4, v4, v9, 16 ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_mov_b32_e32 v59, v2 ; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 @@ -133771,10 +133806,11 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; SI-NEXT: v_mov_b32_e32 v47, v10 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_mov_b32_e32 v45, v12 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v3, v3, v33, 16 -; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_mov_b32_e32 v44, v14 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_alignbit_b32 v1, v1, v33, 16 ; SI-NEXT: v_mov_b32_e32 v62, v38 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 @@ -133814,20 +133850,20 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; SI-NEXT: v_alignbit_b32 v28, v28, v37, 16 ; SI-NEXT: v_mov_b32_e32 v37, v34 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(2) ; SI-NEXT: v_mov_b32_e32 v35, v7 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_mov_b32_e32 v43, v8 ; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) -; SI-NEXT: v_mov_b32_e32 v42, v9 +; SI-NEXT: s_waitcnt vmcnt(2) 
+; SI-NEXT: v_mov_b32_e32 v60, v9 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v32 ; SI-NEXT: v_alignbit_b32 v31, v31, v34, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v60, v8 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_mov_b32_e32 v42, v8 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v11 @@ -133851,7 +133887,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; SI-NEXT: v_alignbit_b32 v12, v12, v14, 16 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v44, v14 +; SI-NEXT: v_mov_b32_e32 v33, v14 ; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) @@ -133874,7 +133910,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; SI-NEXT: .LBB79_2: ; %cmp.true ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 @@ -133890,7 +133926,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_and_b32_e32 v10, 
0xffff0000, v57 @@ -133902,7 +133938,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v45 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v36 @@ -134014,7 +134050,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 @@ -134037,7 +134073,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v43 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v60 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v58 @@ -134052,7 +134088,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v63 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v62 @@ -134138,16 +134174,16 @@ define inreg <16 x double> 
@bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB79_4: ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v61, v53 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload @@ -134156,7 +134192,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; SI-NEXT: v_mov_b32_e32 v57, v11 ; SI-NEXT: v_mov_b32_e32 v47, v10 ; SI-NEXT: v_mov_b32_e32 v45, v12 -; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_mov_b32_e32 v44, v14 ; SI-NEXT: v_mov_b32_e32 v62, v38 ; SI-NEXT: v_mov_b32_e32 v38, v39 ; SI-NEXT: v_mov_b32_e32 v39, v41 @@ -137258,70 +137294,68 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: s_lshr_b32 s46, s5, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v10, s46 ; 
SI-NEXT: s_lshr_b32 s46, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s46 ; SI-NEXT: s_lshr_b32 s46, s7, 16 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v59, s46 -; SI-NEXT: s_lshr_b32 s46, s6, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v8, s46 -; SI-NEXT: s_lshr_b32 s46, s9, 16 +; SI-NEXT: s_lshr_b32 s46, s6, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v9, s46 -; SI-NEXT: s_lshr_b32 s46, s8, 16 +; SI-NEXT: s_lshr_b32 s46, s9, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v16, s46 -; SI-NEXT: s_lshr_b32 s46, s11, 16 +; SI-NEXT: s_lshr_b32 s46, s8, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v23, s46 -; SI-NEXT: s_lshr_b32 s46, s10, 16 +; SI-NEXT: s_lshr_b32 s46, s11, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v27, s46 -; SI-NEXT: s_lshr_b32 s46, s13, 16 +; SI-NEXT: s_lshr_b32 s46, s10, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v31, s46 -; SI-NEXT: s_lshr_b32 s46, s12, 16 +; SI-NEXT: s_lshr_b32 s46, s13, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v38, s46 -; SI-NEXT: s_lshr_b32 s46, s15, 16 +; SI-NEXT: s_lshr_b32 s46, s12, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v50, s46 -; SI-NEXT: s_lshr_b32 s46, s14, 16 +; SI-NEXT: s_lshr_b32 s46, s15, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v54, s46 -; SI-NEXT: s_lshr_b32 s46, s41, 16 +; SI-NEXT: s_lshr_b32 s46, s14, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v40, s46 -; SI-NEXT: s_lshr_b32 s46, s40, 16 +; SI-NEXT: s_lshr_b32 s46, s41, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v42, s46 +; SI-NEXT: s_lshr_b32 s46, s40, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s46 ; SI-NEXT: s_lshr_b32 s46, s43, 16 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v61, s46 -; SI-NEXT: s_lshr_b32 s46, s42, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v12, s46 +; SI-NEXT: s_lshr_b32 s46, s42, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s46 ; SI-NEXT: s_lshr_b32 s46, s45, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v32, s46 ; SI-NEXT: s_lshr_b32 s46, s44, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v36, s46 ; SI-NEXT: s_lshr_b32 s46, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s46 ; SI-NEXT: s_lshr_b32 s46, 
s28, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v48, s46 ; SI-NEXT: s_lshr_b32 s46, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v24, s46 ; SI-NEXT: s_lshr_b32 s46, s26, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v52, s46 ; SI-NEXT: s_lshr_b32 s46, s25, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s46 ; SI-NEXT: s_lshr_b32 s46, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s46 ; SI-NEXT: s_lshr_b32 s46, s23, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s46 ; SI-NEXT: s_lshr_b32 s46, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s46 ; SI-NEXT: s_lshr_b32 s46, s21, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v46, s46 ; SI-NEXT: s_lshr_b32 s46, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s46 ; SI-NEXT: s_lshr_b32 s46, s19, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s46 ; SI-NEXT: s_lshr_b32 s46, s18, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v58, s46 ; SI-NEXT: s_lshr_b32 s46, s17, 16 @@ -137329,8 +137363,7 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: s_lshr_b32 s46, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v62, s46 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 ; SI-NEXT: v_cvt_f32_f16_e32 v15, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v18, s8 @@ -137338,11 +137371,12 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: 
v_cvt_f32_f16_e32 v19, s10 ; SI-NEXT: v_cvt_f32_f16_e32 v39, s13 ; SI-NEXT: v_cvt_f32_f16_e32 v51, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s15 ; SI-NEXT: v_cvt_f32_f16_e32 v43, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v63, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v61, s42 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f32_f16_e32 v37, s29 ; SI-NEXT: v_cvt_f32_f16_e32 v22, s28 @@ -137356,7 +137390,7 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v47, s20 ; SI-NEXT: v_cvt_f32_f16_e32 v34, s19 ; SI-NEXT: v_cvt_f32_f16_e32 v25, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v63, s17 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 ; SI-NEXT: s_cbranch_execnz .LBB81_3 @@ -137381,143 +137415,146 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v49 +; SI-NEXT: v_add_f64 v[14:15], s[10:11], 1.0 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v36 -; SI-NEXT: v_add_f64 v[14:15], s[10:11], 1.0 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v37 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_add_f64 
v[6:7], s[6:7], 1.0 -; SI-NEXT: v_add_f64 v[4:5], s[4:5], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v28 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: v_add_f64 v[29:30], s[42:43], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v29 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v29 +; SI-NEXT: v_add_f64 v[6:7], s[6:7], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v24 -; SI-NEXT: v_add_f64 v[10:11], s[8:9], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v24 ; SI-NEXT: v_add_f64 v[25:26], s[40:41], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v26 +; SI-NEXT: v_add_f64 v[33:34], s[44:45], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v37 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v33 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_add_f64 v[4:5], s[4:5], 1.0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v7 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v34 ; 
SI-NEXT: v_add_f64 v[21:22], s[14:15], 1.0 -; SI-NEXT: v_add_f64 v[33:34], s[44:45], 1.0 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 ; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v21 ; SI-NEXT: v_add_f64 v[18:19], s[12:13], 1.0 ; SI-NEXT: v_cvt_f32_f16_e32 v43, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v30 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v1 ; SI-NEXT: v_add_f64 v[1:2], s[18:19], 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 ; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v7 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f32_f16_e32 v51, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v25 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v25 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v1 ; SI-NEXT: v_add_f64 v[1:2], s[16:17], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; SI-NEXT: 
v_lshrrev_b32_e32 v50, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v7 -; SI-NEXT: v_mov_b32_e32 v7, v61 -; SI-NEXT: v_mov_b32_e32 v61, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v19 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v28 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_add_f64 v[10:11], s[8:9], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v17 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 ; SI-NEXT: v_add_f64 v[46:47], s[20:21], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v15 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v39, v19 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v46 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v47 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v47 +; SI-NEXT: 
v_cvt_f32_f16_e32 v47, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v15 +; SI-NEXT: v_mov_b32_e32 v15, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v18 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v48, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v18, v12 +; SI-NEXT: v_mov_b32_e32 v12, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v47 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v47, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v15 -; SI-NEXT: v_mov_b32_e32 v15, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v14 -; SI-NEXT: v_mov_b32_e32 v14, v12 -; SI-NEXT: v_mov_b32_e32 v12, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v4 -; SI-NEXT: v_mov_b32_e32 v18, v3 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v7, v57 +; SI-NEXT: v_mov_b32_e32 v57, v3 ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:76 ; 
4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v48, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v24, v13 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v52, v13 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v36, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: .LBB81_3: ; %end ; SI-NEXT: v_cvt_f16_f32_e32 v3, v62 @@ -137528,7 +137565,7 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v63 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 @@ -137541,13 
+137578,13 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v34 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v47 ; SI-NEXT: v_add_i32_e32 v4, vcc, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -137561,28 +137598,28 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v21 ; SI-NEXT: v_add_i32_e32 v4, vcc, 24, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v30 ; SI-NEXT: v_add_i32_e32 v4, vcc, 28, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v26 ; SI-NEXT: v_add_i32_e32 v4, vcc, 32, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v53 ; SI-NEXT: v_add_i32_e32 v4, 
vcc, 36, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -137610,7 +137647,7 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v37 ; SI-NEXT: v_add_i32_e32 v4, vcc, 52, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -137632,111 +137669,111 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v6 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v61 ; SI-NEXT: v_add_i32_e32 v4, vcc, 64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v6 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v59 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x44, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v57 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x48, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: 
buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v44 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x4c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v43 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x50, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v55 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x54, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v51 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x58, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v39 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x5c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v19 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, 
v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v11 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v18 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: 
v_or_b32_e32 v1, v2, v1 @@ -137772,24 +137809,24 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr49 @@ -137797,39 +137834,39 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; 
implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -142103,24 +142140,24 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:48 ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:56 -; 
SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:68 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v11 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v11 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v21 @@ -142141,23 +142178,23 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 ; SI-NEXT: s_waitcnt vmcnt(5) expcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v36 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v32 -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v32 +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 
offset:148 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill @@ -142170,45 +142207,46 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword 
v37, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB87_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v7, v0, v61 +; SI-NEXT: v_or_b32_e32 v7, v0, v58 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v9, v0, v50 +; SI-NEXT: v_or_b32_e32 v9, v0, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v10, v0, v43 +; SI-NEXT: v_or_b32_e32 v10, v0, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 -; SI-NEXT: v_or_b32_e32 v11, v0, v41 +; SI-NEXT: v_or_b32_e32 v11, v0, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 -; SI-NEXT: v_or_b32_e32 v12, v0, v40 +; SI-NEXT: v_or_b32_e32 v12, v0, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_mov_b32_e32 v52, v57 -; SI-NEXT: v_mov_b32_e32 v57, v40 -; SI-NEXT: v_mov_b32_e32 v40, v49 -; SI-NEXT: v_mov_b32_e32 v49, v13 +; SI-NEXT: v_mov_b32_e32 v36, v41 +; SI-NEXT: v_mov_b32_e32 v41, v13 ; SI-NEXT: v_or_b32_e32 v13, v0, v13 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 -; SI-NEXT: v_mov_b32_e32 v36, v41 -; SI-NEXT: v_mov_b32_e32 v41, v14 -; SI-NEXT: v_or_b32_e32 v14, v0, v48 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 ; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v50, v43 -; SI-NEXT: v_mov_b32_e32 v43, v48 -; SI-NEXT: v_mov_b32_e32 v48, v15 +; SI-NEXT: v_mov_b32_e32 v50, v45 +; SI-NEXT: v_mov_b32_e32 v45, v14 +; SI-NEXT: v_or_b32_e32 v14, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: v_mov_b32_e32 v52, v57 +; SI-NEXT: v_mov_b32_e32 v57, v49 +; SI-NEXT: v_mov_b32_e32 v49, v40 +; SI-NEXT: v_mov_b32_e32 v40, v15 ; SI-NEXT: v_or_b32_e32 v15, v0, v15 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 -; SI-NEXT: v_mov_b32_e32 v38, v61 +; SI-NEXT: v_mov_b32_e32 v34, v58 +; SI-NEXT: v_mov_b32_e32 v58, v61 ; SI-NEXT: v_mov_b32_e32 v61, v56 ; SI-NEXT: v_mov_b32_e32 
v56, v16 ; SI-NEXT: v_or_b32_e32 v16, v0, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_or_b32_e32 v17, v0, v17 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 ; SI-NEXT: s_waitcnt expcnt(0) @@ -142242,7 +142280,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 ; SI-NEXT: v_or_b32_e32 v26, v0, v26 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 @@ -142253,7 +142291,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; SI-NEXT: s_lshl_b32 s9, s25, 16 ; SI-NEXT: v_mov_b32_e32 v33, v28 ; SI-NEXT: v_or_b32_e32 v28, v0, v5 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 @@ -142265,7 +142303,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; SI-NEXT: s_lshl_b32 s11, s29, 16 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v30, v0, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 ; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: v_mov_b32_e32 v63, v2 ; SI-NEXT: v_mov_b32_e32 v32, v55 @@ -142273,9 +142311,9 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; SI-NEXT: v_mov_b32_e32 v55, v4 ; SI-NEXT: v_mov_b32_e32 v53, v6 ; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_mov_b32_e32 v59, v42 -; SI-NEXT: v_or_b32_e32 v31, v0, v34 +; SI-NEXT: v_mov_b32_e32 v59, v44 +; SI-NEXT: v_mov_b32_e32 v43, v42 +; SI-NEXT: v_or_b32_e32 v31, v0, v48 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -142285,12 +142323,13 @@ define inreg <16 x 
double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_cbranch_execnz .LBB87_3 ; SI-NEXT: .LBB87_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v32, v1 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v38, v43 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 @@ -142334,42 +142373,42 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; 
SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -142431,7 +142470,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -142447,12 +142486,12 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 ; SI-NEXT: v_and_b32_e32 v0, 
0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 @@ -142461,7 +142500,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 @@ -142496,26 +142535,26 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB87_4: -; SI-NEXT: v_mov_b32_e32 v38, v61 +; SI-NEXT: v_mov_b32_e32 v34, v58 ; SI-NEXT: v_mov_b32_e32 v32, v55 +; SI-NEXT: v_mov_b32_e32 v58, v61 ; SI-NEXT: v_mov_b32_e32 v63, v2 ; SI-NEXT: v_mov_b32_e32 v55, v4 ; SI-NEXT: v_mov_b32_e32 v53, v6 ; SI-NEXT: v_mov_b32_e32 v52, v57 ; SI-NEXT: v_mov_b32_e32 v51, v50 ; SI-NEXT: v_mov_b32_e32 v61, v56 -; SI-NEXT: v_mov_b32_e32 v50, v43 +; SI-NEXT: v_mov_b32_e32 v50, v45 ; SI-NEXT: v_mov_b32_e32 v36, v41 -; SI-NEXT: v_mov_b32_e32 v57, v40 -; SI-NEXT: v_mov_b32_e32 v40, v49 -; SI-NEXT: v_mov_b32_e32 v49, v13 -; SI-NEXT: v_mov_b32_e32 v43, v48 -; SI-NEXT: v_mov_b32_e32 v48, v15 -; SI-NEXT: v_mov_b32_e32 v41, v14 +; SI-NEXT: v_mov_b32_e32 v41, v13 +; SI-NEXT: v_mov_b32_e32 v57, v49 +; SI-NEXT: v_mov_b32_e32 v49, v40 +; SI-NEXT: v_mov_b32_e32 v40, v15 +; SI-NEXT: v_mov_b32_e32 v45, v14 ; SI-NEXT: v_mov_b32_e32 v56, v16 ; SI-NEXT: v_mov_b32_e32 
v47, v46 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_mov_b32_e32 v59, v42 +; SI-NEXT: v_mov_b32_e32 v59, v44 +; SI-NEXT: v_mov_b32_e32 v43, v42 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v37, v20 ; SI-NEXT: v_mov_b32_e32 v39, v23 @@ -149422,6 +149461,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332 @@ -149432,15 +149472,18 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:312 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:308 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:304 -; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane -; SI-NEXT: s_mov_b32 s72, s21 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_writelane_b32 v43, s19, 0 -; SI-NEXT: v_writelane_b32 v43, s18, 1 -; SI-NEXT: v_writelane_b32 v43, s17, 2 -; SI-NEXT: v_writelane_b32 v43, s16, 3 -; SI-NEXT: s_mov_b32 s60, s24 +; SI-NEXT: ; implicit-def: $vgpr44 : SGPR spill to VGPR lane +; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_writelane_b32 v41, s30, 0 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_writelane_b32 v44, s29, 0 +; SI-NEXT: v_writelane_b32 v44, s28, 1 +; SI-NEXT: v_writelane_b32 v44, s27, 2 +; SI-NEXT: v_writelane_b32 v44, s26, 3 +; SI-NEXT: v_writelane_b32 v44, s19, 4 +; SI-NEXT: v_writelane_b32 v44, s18, 5 +; SI-NEXT: v_writelane_b32 v44, s17, 6 +; SI-NEXT: v_writelane_b32 v44, s16, 7 ; SI-NEXT: v_writelane_b32 
v41, s31, 1 ; SI-NEXT: v_writelane_b32 v41, s34, 2 ; SI-NEXT: v_writelane_b32 v41, s35, 3 @@ -149464,8 +149507,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_writelane_b32 v41, s69, 21 ; SI-NEXT: v_writelane_b32 v41, s70, 22 ; SI-NEXT: v_writelane_b32 v41, s71, 23 -; SI-NEXT: s_mov_b32 s77, s28 -; SI-NEXT: s_mov_b32 s76, s27 ; SI-NEXT: v_writelane_b32 v41, s80, 24 ; SI-NEXT: v_writelane_b32 v41, s81, 25 ; SI-NEXT: v_writelane_b32 v41, s82, 26 @@ -149476,100 +149517,92 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_writelane_b32 v41, s87, 31 ; SI-NEXT: v_writelane_b32 v41, s96, 32 ; SI-NEXT: v_writelane_b32 v41, s97, 33 +; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane +; SI-NEXT: v_readfirstlane_b32 s30, v16 +; SI-NEXT: v_readfirstlane_b32 s31, v15 +; SI-NEXT: v_readfirstlane_b32 s34, v21 +; SI-NEXT: v_readfirstlane_b32 s35, v22 +; SI-NEXT: v_readfirstlane_b32 s36, v20 +; SI-NEXT: v_readfirstlane_b32 s37, v19 +; SI-NEXT: v_readfirstlane_b32 s38, v25 +; SI-NEXT: v_readfirstlane_b32 s39, v26 +; SI-NEXT: v_readfirstlane_b32 s48, v24 +; SI-NEXT: v_readfirstlane_b32 s49, v23 +; SI-NEXT: v_readfirstlane_b32 s50, v29 +; SI-NEXT: v_readfirstlane_b32 s51, v30 +; SI-NEXT: v_readfirstlane_b32 s52, v28 +; SI-NEXT: v_readfirstlane_b32 s53, v27 ; SI-NEXT: v_writelane_b32 v41, s98, 34 ; SI-NEXT: v_writelane_b32 v41, s99, 35 -; SI-NEXT: s_mov_b32 s79, s26 -; SI-NEXT: v_readfirstlane_b32 s38, v20 -; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane -; SI-NEXT: v_readfirstlane_b32 s39, v19 -; SI-NEXT: v_writelane_b32 v42, s38, 0 -; SI-NEXT: v_readfirstlane_b32 s48, v25 -; SI-NEXT: v_writelane_b32 v42, s39, 1 -; SI-NEXT: v_readfirstlane_b32 s49, v26 -; SI-NEXT: v_writelane_b32 v42, s48, 2 -; SI-NEXT: v_readfirstlane_b32 s50, v24 -; SI-NEXT: v_writelane_b32 v42, s49, 3 -; SI-NEXT: v_readfirstlane_b32 s51, v23 -; SI-NEXT: v_writelane_b32 v42, s50, 4 -; SI-NEXT: 
v_readfirstlane_b32 s52, v29 -; SI-NEXT: v_writelane_b32 v42, s51, 5 -; SI-NEXT: v_readfirstlane_b32 s53, v30 -; SI-NEXT: v_writelane_b32 v42, s52, 6 -; SI-NEXT: v_readfirstlane_b32 s54, v28 -; SI-NEXT: v_writelane_b32 v42, s53, 7 -; SI-NEXT: v_readfirstlane_b32 s55, v27 -; SI-NEXT: v_writelane_b32 v42, s54, 8 -; SI-NEXT: v_writelane_b32 v42, s55, 9 +; SI-NEXT: s_mov_b32 s6, s21 ; SI-NEXT: v_readfirstlane_b32 s16, v1 ; SI-NEXT: v_readfirstlane_b32 s17, v2 ; SI-NEXT: v_readfirstlane_b32 s18, v5 ; SI-NEXT: v_readfirstlane_b32 s19, v6 -; SI-NEXT: v_readfirstlane_b32 s88, v4 -; SI-NEXT: v_readfirstlane_b32 s89, v3 -; SI-NEXT: v_readfirstlane_b32 s90, v9 +; SI-NEXT: v_readfirstlane_b32 s78, v4 +; SI-NEXT: v_readfirstlane_b32 s79, v3 +; SI-NEXT: v_readfirstlane_b32 s88, v9 +; SI-NEXT: v_readfirstlane_b32 s89, v10 +; SI-NEXT: v_readfirstlane_b32 s90, v8 +; SI-NEXT: v_readfirstlane_b32 s91, v7 +; SI-NEXT: v_readfirstlane_b32 s92, v13 +; SI-NEXT: v_readfirstlane_b32 s93, v14 +; SI-NEXT: v_readfirstlane_b32 s94, v12 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s6, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:296 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:292 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:288 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:284 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:280 +; SI-NEXT: v_writelane_b32 v44, s4, 8 ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s4, v32 -; SI-NEXT: v_writelane_b32 v43, s4, 4 +; SI-NEXT: v_writelane_b32 v44, s4, 9 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v33 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:272 -; SI-NEXT: v_writelane_b32 v43, s4, 5 +; SI-NEXT: v_writelane_b32 v44, s4, 10 ; SI-NEXT: s_waitcnt vmcnt(12) ; 
SI-NEXT: v_readfirstlane_b32 s4, v34 -; SI-NEXT: v_writelane_b32 v43, s4, 6 +; SI-NEXT: v_writelane_b32 v44, s4, 11 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v35 -; SI-NEXT: v_writelane_b32 v43, s4, 7 +; SI-NEXT: v_writelane_b32 v44, s4, 12 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s4, v36 -; SI-NEXT: v_writelane_b32 v43, s4, 8 +; SI-NEXT: v_writelane_b32 v44, s4, 13 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s4, v37 +; SI-NEXT: v_writelane_b32 v44, s4, 14 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:268 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:264 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:260 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:256 -; SI-NEXT: v_writelane_b32 v43, s4, 9 ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s4, v38 -; SI-NEXT: v_writelane_b32 v43, s4, 10 -; SI-NEXT: v_readfirstlane_b32 s91, v10 -; SI-NEXT: v_readfirstlane_b32 s92, v8 -; SI-NEXT: v_readfirstlane_b32 s93, v7 -; SI-NEXT: v_readfirstlane_b32 s94, v13 -; SI-NEXT: v_readfirstlane_b32 s95, v14 -; SI-NEXT: v_readfirstlane_b32 s30, v17 -; SI-NEXT: v_readfirstlane_b32 s31, v18 -; SI-NEXT: v_readfirstlane_b32 s34, v16 -; SI-NEXT: v_readfirstlane_b32 s35, v15 -; SI-NEXT: v_readfirstlane_b32 s36, v21 -; SI-NEXT: v_readfirstlane_b32 s37, v22 +; SI-NEXT: v_writelane_b32 v44, s4, 15 +; SI-NEXT: v_readfirstlane_b32 s95, v11 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: v_writelane_b32 v43, s4, 11 +; SI-NEXT: v_writelane_b32 v44, s4, 16 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s4, v39 -; SI-NEXT: v_writelane_b32 v43, s4, 12 +; SI-NEXT: v_writelane_b32 v44, s4, 17 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s4, v48 -; SI-NEXT: v_writelane_b32 v43, s4, 13 +; SI-NEXT: v_writelane_b32 v44, s4, 18 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_readfirstlane_b32 s4, v49 -; SI-NEXT: v_writelane_b32 
v43, s4, 14 +; SI-NEXT: v_writelane_b32 v44, s4, 19 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_readfirstlane_b32 s4, v50 -; SI-NEXT: v_writelane_b32 v43, s4, 15 +; SI-NEXT: v_writelane_b32 v44, s4, 20 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_readfirstlane_b32 s4, v51 +; SI-NEXT: v_writelane_b32 v44, s4, 21 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:252 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:248 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:244 @@ -149578,39 +149611,49 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:232 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:228 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s75, v32 +; SI-NEXT: v_readfirstlane_b32 s4, v32 +; SI-NEXT: v_writelane_b32 v44, s4, 22 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s61, v33 +; SI-NEXT: v_readfirstlane_b32 s4, v33 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:220 -; SI-NEXT: v_writelane_b32 v43, s4, 16 +; SI-NEXT: v_writelane_b32 v44, s4, 23 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s43, v34 +; SI-NEXT: v_readfirstlane_b32 s4, v34 +; SI-NEXT: v_writelane_b32 v44, s4, 24 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s40, v35 +; SI-NEXT: v_readfirstlane_b32 s4, v35 +; SI-NEXT: v_writelane_b32 v44, s4, 25 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s4, v36 +; SI-NEXT: v_writelane_b32 v44, s4, 26 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s63, v37 +; SI-NEXT: v_readfirstlane_b32 s4, v37 +; SI-NEXT: v_writelane_b32 v44, s4, 27 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:216 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:212 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:208 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 
offset:204 -; SI-NEXT: v_writelane_b32 v43, s4, 17 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s59, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_writelane_b32 v44, s4, 28 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s42, v38 +; SI-NEXT: v_readfirstlane_b32 s4, v38 +; SI-NEXT: v_writelane_b32 v44, s4, 29 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s73, v39 +; SI-NEXT: v_readfirstlane_b32 s4, v39 +; SI-NEXT: v_writelane_b32 v44, s4, 30 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s21, v48 +; SI-NEXT: v_readfirstlane_b32 s4, v48 +; SI-NEXT: v_writelane_b32 v44, s4, 31 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s57, v49 +; SI-NEXT: v_readfirstlane_b32 s4, v49 +; SI-NEXT: v_writelane_b32 v44, s4, 32 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s13, v50 +; SI-NEXT: v_readfirstlane_b32 s4, v50 +; SI-NEXT: v_writelane_b32 v44, s4, 33 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_readfirstlane_b32 s45, v51 +; SI-NEXT: v_readfirstlane_b32 s4, v51 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:200 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:196 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:192 @@ -149618,51 +149661,43 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:184 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s47, v32 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s24, v33 +; SI-NEXT: v_readfirstlane_b32 s58, v33 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:172 ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:168 +; SI-NEXT: v_readfirstlane_b32 s26, v32 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s77, v34 +; SI-NEXT: s_waitcnt 
vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s63, v35 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s57, v36 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s56, v37 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164 ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160 ; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_readfirstlane_b32 s78, v34 -; SI-NEXT: v_readfirstlane_b32 s4, v35 -; SI-NEXT: v_writelane_b32 v43, s4, 18 -; SI-NEXT: v_readfirstlane_b32 s4, v36 -; SI-NEXT: v_writelane_b32 v43, s4, 19 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_readfirstlane_b32 s4, v37 -; SI-NEXT: v_writelane_b32 v43, s4, 20 +; SI-NEXT: v_writelane_b32 v44, s4, 34 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: v_writelane_b32 v43, s4, 21 +; SI-NEXT: v_readfirstlane_b32 s61, v31 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s4, v38 -; SI-NEXT: v_writelane_b32 v43, s4, 22 +; SI-NEXT: v_readfirstlane_b32 s74, v38 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s4, v39 -; SI-NEXT: v_writelane_b32 v43, s4, 23 +; SI-NEXT: v_readfirstlane_b32 s76, v39 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s4, v48 -; SI-NEXT: v_writelane_b32 v43, s4, 24 +; SI-NEXT: v_readfirstlane_b32 s47, v48 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s4, v49 -; SI-NEXT: v_writelane_b32 v43, s4, 25 +; SI-NEXT: v_readfirstlane_b32 s45, v49 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s4, v50 -; SI-NEXT: v_writelane_b32 v43, s4, 26 +; SI-NEXT: v_readfirstlane_b32 s60, v50 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_readfirstlane_b32 s4, v51 -; SI-NEXT: v_writelane_b32 v43, s4, 27 +; SI-NEXT: v_readfirstlane_b32 s42, v51 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; SI-NEXT: 
buffer_load_dword v32, off, s[0:3], s32 offset:144 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s4, v33 +; SI-NEXT: v_readfirstlane_b32 s13, v33 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:140 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:132 @@ -149674,43 +149709,43 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 -; SI-NEXT: v_writelane_b32 v43, s4, 28 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_readfirstlane_b32 s4, v52 -; SI-NEXT: v_writelane_b32 v43, s4, 29 -; SI-NEXT: v_readfirstlane_b32 s4, v53 -; SI-NEXT: v_writelane_b32 v43, s4, 30 -; SI-NEXT: v_readfirstlane_b32 s4, v54 -; SI-NEXT: v_writelane_b32 v43, s4, 31 -; SI-NEXT: v_readfirstlane_b32 s4, v55 -; SI-NEXT: v_writelane_b32 v43, s4, 32 +; SI-NEXT: v_readfirstlane_b32 s72, v52 +; SI-NEXT: v_readfirstlane_b32 s73, v53 +; SI-NEXT: v_readfirstlane_b32 s44, v55 ; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_readfirstlane_b32 s4, v40 -; SI-NEXT: v_writelane_b32 v43, s4, 33 -; SI-NEXT: v_writelane_b32 v43, s22, 34 -; SI-NEXT: v_writelane_b32 v43, s23, 35 -; SI-NEXT: v_writelane_b32 v43, s72, 36 -; SI-NEXT: v_writelane_b32 v43, s20, 37 -; SI-NEXT: v_writelane_b32 v43, s79, 38 -; SI-NEXT: v_writelane_b32 v43, s76, 39 -; SI-NEXT: v_writelane_b32 v43, s25, 40 -; SI-NEXT: v_writelane_b32 v43, s60, 41 -; SI-NEXT: v_writelane_b32 v43, s29, 42 -; SI-NEXT: v_writelane_b32 v43, s77, 43 -; SI-NEXT: v_writelane_b32 v43, s16, 44 -; SI-NEXT: v_writelane_b32 v43, s17, 45 -; SI-NEXT: v_writelane_b32 v43, s18, 46 -; SI-NEXT: v_writelane_b32 v43, s19, 47 -; SI-NEXT: v_writelane_b32 v43, s88, 48 -; SI-NEXT: v_writelane_b32 v43, s89, 49 -; SI-NEXT: v_writelane_b32 v43, s90, 50 -; SI-NEXT: v_writelane_b32 v43, 
s91, 51 -; SI-NEXT: v_writelane_b32 v43, s92, 52 -; SI-NEXT: v_writelane_b32 v43, s93, 53 -; SI-NEXT: v_writelane_b32 v43, s94, 54 -; SI-NEXT: v_writelane_b32 v43, s95, 55 +; SI-NEXT: v_writelane_b32 v44, s4, 35 +; SI-NEXT: v_writelane_b32 v44, s22, 36 +; SI-NEXT: v_writelane_b32 v44, s23, 37 +; SI-NEXT: v_writelane_b32 v44, s6, 38 +; SI-NEXT: v_writelane_b32 v44, s20, 39 +; SI-NEXT: v_writelane_b32 v44, s25, 40 +; SI-NEXT: v_writelane_b32 v44, s24, 41 +; SI-NEXT: v_writelane_b32 v44, s44, 42 +; SI-NEXT: v_writelane_b32 v44, s72, 43 +; SI-NEXT: v_writelane_b32 v44, s13, 44 +; SI-NEXT: v_writelane_b32 v44, s60, 45 +; SI-NEXT: v_writelane_b32 v44, s73, 46 +; SI-NEXT: v_readfirstlane_b32 s21, v54 +; SI-NEXT: v_writelane_b32 v44, s42, 47 +; SI-NEXT: v_writelane_b32 v44, s21, 48 +; SI-NEXT: v_writelane_b32 v44, s16, 49 +; SI-NEXT: v_writelane_b32 v44, s17, 50 +; SI-NEXT: v_writelane_b32 v44, s18, 51 +; SI-NEXT: v_writelane_b32 v44, s19, 52 +; SI-NEXT: v_writelane_b32 v44, s78, 53 +; SI-NEXT: v_writelane_b32 v44, s79, 54 +; SI-NEXT: v_writelane_b32 v44, s88, 55 +; SI-NEXT: v_writelane_b32 v44, s89, 56 +; SI-NEXT: v_writelane_b32 v44, s90, 57 +; SI-NEXT: v_writelane_b32 v44, s91, 58 +; SI-NEXT: v_writelane_b32 v44, s92, 59 +; SI-NEXT: v_writelane_b32 v44, s93, 60 +; SI-NEXT: v_writelane_b32 v44, s94, 61 +; SI-NEXT: v_writelane_b32 v44, s95, 62 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s62, v33 +; SI-NEXT: v_readfirstlane_b32 s40, v33 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s10, v34 ; SI-NEXT: s_waitcnt vmcnt(8) @@ -149718,13 +149753,13 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_readfirstlane_b32 s28, v31 ; SI-NEXT: v_readfirstlane_b32 s27, v32 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s58, v36 +; SI-NEXT: v_readfirstlane_b32 s29, v36 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_readfirstlane_b32 s69, v37 +; SI-NEXT: v_readfirstlane_b32 s70, v37 ; SI-NEXT: 
s_waitcnt vmcnt(5) ; SI-NEXT: v_readfirstlane_b32 s14, v38 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_readfirstlane_b32 s68, v39 +; SI-NEXT: v_readfirstlane_b32 s69, v39 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 @@ -149739,42 +149774,50 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s11, v49 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s70, v50 +; SI-NEXT: v_readfirstlane_b32 s71, v50 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s71, v51 +; SI-NEXT: v_readfirstlane_b32 s81, v51 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:64 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:52 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 vcc_lo, v12 -; SI-NEXT: v_readfirstlane_b32 vcc_hi, v11 -; SI-NEXT: v_writelane_b32 v43, vcc_lo, 56 -; SI-NEXT: v_writelane_b32 v43, vcc_hi, 57 -; SI-NEXT: v_writelane_b32 v43, s30, 58 -; SI-NEXT: v_writelane_b32 v43, s31, 59 -; SI-NEXT: v_writelane_b32 v43, s34, 60 -; SI-NEXT: v_writelane_b32 v43, s35, 61 -; SI-NEXT: v_writelane_b32 v43, s36, 62 -; SI-NEXT: v_writelane_b32 v43, s37, 63 +; SI-NEXT: v_readfirstlane_b32 vcc_hi, v18 +; SI-NEXT: v_writelane_b32 v43, vcc_hi, 0 +; SI-NEXT: v_writelane_b32 v43, s30, 1 +; SI-NEXT: v_writelane_b32 v43, s31, 2 +; SI-NEXT: v_writelane_b32 v43, s34, 3 +; SI-NEXT: v_writelane_b32 v43, s35, 4 +; SI-NEXT: v_writelane_b32 v43, s36, 5 +; SI-NEXT: v_writelane_b32 v43, s37, 6 +; SI-NEXT: v_writelane_b32 v43, s38, 7 +; SI-NEXT: v_writelane_b32 v43, s39, 8 +; SI-NEXT: v_writelane_b32 v43, s48, 9 +; SI-NEXT: v_writelane_b32 v43, s49, 10 +; SI-NEXT: v_writelane_b32 v43, s50, 11 +; 
SI-NEXT: v_writelane_b32 v43, s51, 12 +; SI-NEXT: v_writelane_b32 v43, s52, 13 +; SI-NEXT: v_writelane_b32 v43, s53, 14 +; SI-NEXT: v_readfirstlane_b32 vcc_lo, v17 +; SI-NEXT: v_writelane_b32 v44, vcc_lo, 63 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s74, v31 +; SI-NEXT: v_readfirstlane_b32 s46, v31 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s46, v32 +; SI-NEXT: v_readfirstlane_b32 s59, v32 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s96, v33 +; SI-NEXT: v_readfirstlane_b32 s83, v33 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s98, v34 +; SI-NEXT: v_readfirstlane_b32 s12, v34 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s41, v35 +; SI-NEXT: v_readfirstlane_b32 s97, v35 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s56, v36 +; SI-NEXT: v_readfirstlane_b32 s8, v36 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_readfirstlane_b32 s87, v37 +; SI-NEXT: v_readfirstlane_b32 s84, v37 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_readfirstlane_b32 s99, v38 +; SI-NEXT: v_readfirstlane_b32 s86, v38 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_readfirstlane_b32 s81, v39 +; SI-NEXT: v_readfirstlane_b32 s15, v39 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 @@ -149784,415 +149827,417 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s26, v48 +; SI-NEXT: v_readfirstlane_b32 s62, v48 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s83, v49 +; SI-NEXT: v_readfirstlane_b32 s96, v49 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s82, v50 +; SI-NEXT: v_readfirstlane_b32 s7, v50 ; SI-NEXT: s_waitcnt 
vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s7, v51 +; SI-NEXT: v_readfirstlane_b32 s80, v51 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s15, v31 +; SI-NEXT: v_readfirstlane_b32 s41, v31 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s97, v32 +; SI-NEXT: v_readfirstlane_b32 s98, v32 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s44, v33 +; SI-NEXT: v_readfirstlane_b32 s99, v33 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s9, v34 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s80, v35 +; SI-NEXT: v_readfirstlane_b32 s82, v35 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s86, v36 +; SI-NEXT: v_readfirstlane_b32 s68, v36 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_readfirstlane_b32 s85, v37 +; SI-NEXT: v_readfirstlane_b32 s67, v37 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_readfirstlane_b32 s8, v38 +; SI-NEXT: v_readfirstlane_b32 s85, v38 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_readfirstlane_b32 s12, v39 +; SI-NEXT: v_readfirstlane_b32 s87, v39 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_readfirstlane_b32 s65, v48 +; SI-NEXT: v_readfirstlane_b32 s55, v48 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_readfirstlane_b32 s64, v49 -; SI-NEXT: v_writelane_b32 v42, s64, 10 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_readfirstlane_b32 s67, v50 -; SI-NEXT: v_writelane_b32 v42, s65, 11 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s84, v51 -; SI-NEXT: v_writelane_b32 v42, s67, 12 -; SI-NEXT: v_writelane_b32 v42, s84, 13 -; SI-NEXT: v_writelane_b32 v42, s85, 14 -; SI-NEXT: v_writelane_b32 v42, s86, 15 -; SI-NEXT: v_writelane_b32 v42, s87, 16 -; SI-NEXT: v_writelane_b32 v42, 
s8, 17 -; SI-NEXT: v_writelane_b32 v42, s99, 18 -; SI-NEXT: v_writelane_b32 v42, s12, 19 -; SI-NEXT: v_writelane_b32 v42, s44, 20 -; SI-NEXT: v_writelane_b32 v42, s97, 21 -; SI-NEXT: v_writelane_b32 v42, s83, 22 -; SI-NEXT: v_writelane_b32 v42, s82, 23 -; SI-NEXT: v_writelane_b32 v42, s98, 24 -; SI-NEXT: v_writelane_b32 v42, s96, 25 -; SI-NEXT: v_writelane_b32 v42, s81, 26 -; SI-NEXT: v_writelane_b32 v42, s9, 27 -; SI-NEXT: v_writelane_b32 v42, s41, 28 -; SI-NEXT: v_writelane_b32 v42, s80, 29 -; SI-NEXT: v_writelane_b32 v42, s7, 30 -; SI-NEXT: v_writelane_b32 v42, s56, 31 -; SI-NEXT: v_writelane_b32 v42, s26, 32 -; SI-NEXT: v_writelane_b32 v42, s15, 33 -; SI-NEXT: v_writelane_b32 v42, s14, 34 -; SI-NEXT: v_writelane_b32 v42, s69, 35 -; SI-NEXT: v_writelane_b32 v42, s71, 36 -; SI-NEXT: v_writelane_b32 v42, s70, 37 -; SI-NEXT: v_writelane_b32 v42, s68, 38 -; SI-NEXT: v_writelane_b32 v42, s74, 39 -; SI-NEXT: v_writelane_b32 v42, s46, 40 -; SI-NEXT: v_writelane_b32 v42, s11, 41 -; SI-NEXT: v_writelane_b32 v42, s10, 42 -; SI-NEXT: v_writelane_b32 v42, s62, 43 -; SI-NEXT: v_writelane_b32 v42, s66, 44 -; SI-NEXT: v_writelane_b32 v42, s58, 45 -; SI-NEXT: v_writelane_b32 v42, s28, 46 -; SI-NEXT: v_writelane_b32 v42, s27, 47 -; SI-NEXT: v_writelane_b32 v42, s78, 48 -; SI-NEXT: v_writelane_b32 v42, s24, 49 +; SI-NEXT: v_readfirstlane_b32 s54, v49 +; SI-NEXT: v_writelane_b32 v43, s54, 15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_readfirstlane_b32 s64, v50 +; SI-NEXT: v_writelane_b32 v43, s55, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s65, v51 +; SI-NEXT: v_writelane_b32 v43, s64, 17 +; SI-NEXT: v_writelane_b32 v43, s65, 18 +; SI-NEXT: v_writelane_b32 v43, s67, 19 +; SI-NEXT: v_writelane_b32 v43, s68, 20 +; SI-NEXT: v_writelane_b32 v43, s84, 21 +; SI-NEXT: v_writelane_b32 v43, s85, 22 +; SI-NEXT: v_writelane_b32 v43, s86, 23 +; SI-NEXT: v_writelane_b32 v43, s87, 24 +; SI-NEXT: v_writelane_b32 v43, s99, 25 +; SI-NEXT: v_writelane_b32 v43, s98, 26 +; 
SI-NEXT: v_writelane_b32 v43, s96, 27 +; SI-NEXT: v_writelane_b32 v43, s7, 28 +; SI-NEXT: v_writelane_b32 v43, s12, 29 +; SI-NEXT: v_writelane_b32 v43, s83, 30 +; SI-NEXT: v_writelane_b32 v43, s15, 31 +; SI-NEXT: v_writelane_b32 v43, s9, 32 +; SI-NEXT: v_writelane_b32 v43, s97, 33 +; SI-NEXT: v_writelane_b32 v43, s82, 34 +; SI-NEXT: v_writelane_b32 v43, s80, 35 +; SI-NEXT: v_writelane_b32 v43, s8, 36 +; SI-NEXT: v_writelane_b32 v43, s62, 37 +; SI-NEXT: v_writelane_b32 v43, s41, 38 +; SI-NEXT: v_writelane_b32 v43, s14, 39 +; SI-NEXT: v_writelane_b32 v43, s70, 40 +; SI-NEXT: v_writelane_b32 v43, s81, 41 +; SI-NEXT: v_writelane_b32 v43, s71, 42 +; SI-NEXT: v_writelane_b32 v43, s69, 43 +; SI-NEXT: v_writelane_b32 v43, s46, 44 +; SI-NEXT: v_writelane_b32 v43, s59, 45 +; SI-NEXT: v_writelane_b32 v43, s11, 46 +; SI-NEXT: v_writelane_b32 v43, s10, 47 +; SI-NEXT: v_writelane_b32 v43, s40, 48 +; SI-NEXT: v_writelane_b32 v43, s66, 49 +; SI-NEXT: v_writelane_b32 v43, s29, 50 +; SI-NEXT: v_writelane_b32 v43, s28, 51 +; SI-NEXT: v_writelane_b32 v43, s27, 52 +; SI-NEXT: v_writelane_b32 v43, s45, 53 +; SI-NEXT: v_writelane_b32 v43, s47, 54 +; SI-NEXT: v_writelane_b32 v43, s61, 55 ; SI-NEXT: s_cbranch_scc0 .LBB89_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_readlane_b32 s4, v43, 3 +; SI-NEXT: v_readlane_b32 s4, v44, 7 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: v_readlane_b32 s5, v43, 2 +; SI-NEXT: v_readlane_b32 s5, v44, 6 ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s5, 24 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_writelane_b32 v42, s4, 56 -; SI-NEXT: v_readlane_b32 s4, v43, 1 +; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane +; SI-NEXT: v_readlane_b32 s5, v44, 4 +; SI-NEXT: v_writelane_b32 v42, s4, 0 +; SI-NEXT: v_readlane_b32 s4, v44, 5 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: v_readlane_b32 s5, v43, 0 ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s5, 24 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_writelane_b32 v42, s4, 57 
+; SI-NEXT: v_writelane_b32 v42, s4, 1 ; SI-NEXT: s_and_b32 s4, s20, 0xff -; SI-NEXT: s_lshl_b32 s5, s72, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_lshl_b32 s5, s6, 8 +; SI-NEXT: s_or_b32 s43, s4, s5 ; SI-NEXT: s_and_b32 s5, s22, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_mov_b32 s22, s6 ; SI-NEXT: s_lshl_b32 s6, s23, 24 -; SI-NEXT: v_writelane_b32 v42, s4, 58 ; SI-NEXT: s_or_b32 s4, s6, s5 -; SI-NEXT: s_and_b32 s5, s60, 0xff +; SI-NEXT: s_and_b32 s5, s24, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s6, s25, 24 -; SI-NEXT: v_writelane_b32 v42, s4, 59 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: v_writelane_b32 v42, s5, 60 -; SI-NEXT: s_and_b32 s5, s79, 0xff +; SI-NEXT: v_writelane_b32 v42, s4, 2 +; SI-NEXT: s_or_b32 s4, s6, s5 +; SI-NEXT: v_readlane_b32 s5, v44, 3 +; SI-NEXT: s_and_b32 s5, s5, 0xff +; SI-NEXT: v_readlane_b32 s6, v44, 2 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s6, s76, 24 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: v_writelane_b32 v42, s5, 61 -; SI-NEXT: s_and_b32 s5, s77, 0xff -; SI-NEXT: s_lshl_b32 s6, s29, 8 -; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_lshl_b32 s6, s6, 24 +; SI-NEXT: s_or_b32 s25, s6, s5 +; SI-NEXT: v_readlane_b32 s5, v44, 1 +; SI-NEXT: v_readlane_b32 s6, v44, 0 +; SI-NEXT: s_and_b32 s5, s5, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 8 +; SI-NEXT: s_or_b32 s24, s5, s6 ; SI-NEXT: s_and_b32 s6, s16, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_lshl_b32 s16, s17, 24 -; SI-NEXT: s_or_b32 s6, s16, s6 -; SI-NEXT: v_writelane_b32 v42, s6, 62 -; SI-NEXT: s_and_b32 s6, s89, 0xff +; SI-NEXT: v_writelane_b32 v42, s4, 3 +; SI-NEXT: s_or_b32 s4, s16, s6 +; SI-NEXT: s_and_b32 s6, s79, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s16, s88, 24 -; SI-NEXT: s_mov_b32 s4, s47 -; SI-NEXT: s_or_b32 s47, s16, s6 +; SI-NEXT: s_lshl_b32 s16, s78, 24 +; SI-NEXT: s_or_b32 s5, s16, s6 ; SI-NEXT: s_and_b32 s6, s18, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_lshl_b32 s16, s19, 24 -; 
SI-NEXT: s_or_b32 s25, s16, s6 -; SI-NEXT: s_and_b32 s6, s93, 0xff -; SI-NEXT: s_lshl_b32 s16, s92, 8 +; SI-NEXT: s_or_b32 s75, s16, s6 +; SI-NEXT: s_and_b32 s6, s91, 0xff +; SI-NEXT: s_lshl_b32 s16, s90, 8 ; SI-NEXT: s_or_b32 s6, s6, s16 -; SI-NEXT: s_and_b32 s16, s90, 0xff +; SI-NEXT: s_and_b32 s16, s88, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_lshl_b32 s17, s91, 24 -; SI-NEXT: s_or_b32 s92, s17, s16 -; SI-NEXT: s_and_b32 s16, vcc_hi, 0xff +; SI-NEXT: s_lshl_b32 s17, s89, 24 +; SI-NEXT: s_or_b32 s78, s17, s16 +; SI-NEXT: s_and_b32 s16, s95, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_lshl_b32 s17, vcc_lo, 24 -; SI-NEXT: s_or_b32 s76, s17, s16 -; SI-NEXT: s_and_b32 s16, s94, 0xff +; SI-NEXT: s_lshl_b32 s17, s94, 24 +; SI-NEXT: s_mov_b32 s23, s21 +; SI-NEXT: s_or_b32 s21, s17, s16 +; SI-NEXT: s_and_b32 s16, s92, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_lshl_b32 s17, s95, 24 -; SI-NEXT: s_or_b32 s91, s17, s16 -; SI-NEXT: s_and_b32 s16, s35, 0xff -; SI-NEXT: s_lshl_b32 s17, s34, 8 +; SI-NEXT: s_lshl_b32 s17, s93, 24 +; SI-NEXT: s_or_b32 s79, s17, s16 +; SI-NEXT: s_and_b32 s16, s31, 0xff +; SI-NEXT: s_lshl_b32 s17, s30, 8 ; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: s_and_b32 s17, s30, 0xff +; SI-NEXT: s_and_b32 s17, vcc_lo, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_lshl_b32 s18, s31, 24 -; SI-NEXT: s_or_b32 s77, s18, s17 -; SI-NEXT: s_and_b32 s17, s39, 0xff +; SI-NEXT: s_lshl_b32 s18, vcc_hi, 24 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: v_writelane_b32 v43, s17, 56 +; SI-NEXT: s_and_b32 s17, s37, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_lshl_b32 s18, s38, 24 -; SI-NEXT: s_or_b32 s79, s18, s17 -; SI-NEXT: s_and_b32 s17, s36, 0xff +; SI-NEXT: s_lshl_b32 s18, s36, 24 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: v_writelane_b32 v43, s17, 58 +; SI-NEXT: s_and_b32 s17, s34, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_lshl_b32 s18, s37, 24 -; SI-NEXT: s_or_b32 s93, s18, s17 -; SI-NEXT: s_and_b32 
s17, s51, 0xff -; SI-NEXT: s_lshl_b32 s18, s50, 8 +; SI-NEXT: s_lshl_b32 s18, s35, 24 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: v_writelane_b32 v43, s17, 57 +; SI-NEXT: s_and_b32 s17, s49, 0xff +; SI-NEXT: s_lshl_b32 s18, s48, 8 ; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: s_and_b32 s18, s48, 0xff +; SI-NEXT: s_and_b32 s18, s38, 0xff ; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_lshl_b32 s19, s49, 24 -; SI-NEXT: s_or_b32 s89, s19, s18 -; SI-NEXT: s_and_b32 s18, s55, 0xff +; SI-NEXT: s_lshl_b32 s19, s39, 24 +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: v_writelane_b32 v43, s18, 59 +; SI-NEXT: s_and_b32 s18, s53, 0xff ; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_lshl_b32 s19, s54, 24 -; SI-NEXT: s_or_b32 s31, s19, s18 -; SI-NEXT: s_and_b32 s18, s52, 0xff +; SI-NEXT: s_lshl_b32 s19, s52, 24 +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: v_writelane_b32 v43, s18, 61 +; SI-NEXT: s_and_b32 s18, s50, 0xff ; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_lshl_b32 s19, s53, 24 -; SI-NEXT: s_or_b32 s94, s19, s18 -; SI-NEXT: s_and_b32 s18, s84, 0xff -; SI-NEXT: s_lshl_b32 s19, s67, 8 +; SI-NEXT: s_lshl_b32 s19, s51, 24 +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: v_writelane_b32 v43, s18, 60 +; SI-NEXT: s_and_b32 s18, s65, 0xff +; SI-NEXT: s_lshl_b32 s19, s64, 8 ; SI-NEXT: s_or_b32 s18, s18, s19 -; SI-NEXT: s_and_b32 s19, s64, 0xff +; SI-NEXT: s_and_b32 s19, s54, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s65, 24 -; SI-NEXT: s_or_b32 s60, s20, s19 -; SI-NEXT: s_and_b32 s19, s12, 0xff +; SI-NEXT: s_lshl_b32 s20, s55, 24 +; SI-NEXT: s_or_b32 s19, s20, s19 +; SI-NEXT: v_writelane_b32 v43, s19, 62 +; SI-NEXT: s_and_b32 s19, s87, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s8, 24 -; SI-NEXT: s_or_b32 s8, s20, s19 -; SI-NEXT: s_and_b32 s19, s85, 0xff +; SI-NEXT: s_lshl_b32 s20, s85, 24 +; SI-NEXT: s_or_b32 s19, s20, s19 +; SI-NEXT: v_writelane_b32 v43, s19, 63 +; SI-NEXT: s_and_b32 s19, s67, 0xff ; SI-NEXT: s_lshl_b32 
s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s86, 24 -; SI-NEXT: s_or_b32 s12, s20, s19 -; SI-NEXT: s_and_b32 s19, s80, 0xff +; SI-NEXT: s_lshl_b32 s20, s68, 24 +; SI-NEXT: s_or_b32 s95, s20, s19 +; SI-NEXT: s_and_b32 s19, s82, 0xff ; SI-NEXT: s_lshl_b32 s20, s9, 8 ; SI-NEXT: s_or_b32 vcc_lo, s19, s20 -; SI-NEXT: s_and_b32 s19, s44, 0xff +; SI-NEXT: s_and_b32 s19, s99, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s97, 24 -; SI-NEXT: s_or_b32 s9, s20, s19 -; SI-NEXT: s_and_b32 s19, s15, 0xff +; SI-NEXT: s_lshl_b32 s20, s98, 24 +; SI-NEXT: s_or_b32 s30, s20, s19 +; SI-NEXT: s_and_b32 s19, s41, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s7, 24 -; SI-NEXT: s_or_b32 s7, s20, s19 -; SI-NEXT: s_and_b32 s19, s82, 0xff +; SI-NEXT: s_lshl_b32 s20, s80, 24 +; SI-NEXT: s_or_b32 s31, s20, s19 +; SI-NEXT: s_and_b32 s19, s7, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s83, 24 -; SI-NEXT: s_or_b32 s23, s20, s19 -; SI-NEXT: s_and_b32 s19, s26, 0xff -; SI-NEXT: s_lshl_b32 s20, s81, 8 +; SI-NEXT: s_lshl_b32 s20, s96, 24 +; SI-NEXT: s_or_b32 s34, s20, s19 +; SI-NEXT: s_and_b32 s19, s62, 0xff +; SI-NEXT: s_lshl_b32 s20, s15, 8 ; SI-NEXT: s_or_b32 vcc_hi, s19, s20 -; SI-NEXT: s_and_b32 s19, s99, 0xff -; SI-NEXT: v_writelane_b32 v42, s9, 50 +; SI-NEXT: s_and_b32 s19, s86, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s87, 24 -; SI-NEXT: v_writelane_b32 v42, s7, 51 -; SI-NEXT: s_or_b32 s7, s20, s19 -; SI-NEXT: s_and_b32 s19, s56, 0xff +; SI-NEXT: s_lshl_b32 s20, s84, 24 +; SI-NEXT: s_or_b32 s35, s20, s19 +; SI-NEXT: s_and_b32 s19, s8, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s41, 24 -; SI-NEXT: v_writelane_b32 v42, s7, 52 -; SI-NEXT: s_or_b32 s7, s20, s19 -; SI-NEXT: s_and_b32 s19, s98, 0xff +; SI-NEXT: s_lshl_b32 s20, s97, 24 +; SI-NEXT: s_or_b32 s36, s20, s19 +; SI-NEXT: s_and_b32 s19, s12, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s96, 24 -; 
SI-NEXT: v_writelane_b32 v42, s7, 54 -; SI-NEXT: s_or_b32 s7, s20, s19 -; SI-NEXT: s_and_b32 s19, s46, 0xff -; SI-NEXT: s_lshl_b32 s20, s74, 8 +; SI-NEXT: s_lshl_b32 s20, s83, 24 +; SI-NEXT: s_or_b32 s37, s20, s19 +; SI-NEXT: s_and_b32 s19, s59, 0xff +; SI-NEXT: s_lshl_b32 s20, s46, 8 ; SI-NEXT: s_or_b32 s84, s19, s20 -; SI-NEXT: s_and_b32 s19, s71, 0xff +; SI-NEXT: s_and_b32 s19, s81, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s70, 24 -; SI-NEXT: s_or_b32 s72, s20, s19 +; SI-NEXT: s_lshl_b32 s20, s71, 24 +; SI-NEXT: s_or_b32 s38, s20, s19 ; SI-NEXT: s_and_b32 s19, s11, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s68, 24 -; SI-NEXT: v_writelane_b32 v42, s7, 53 -; SI-NEXT: s_or_b32 s7, s20, s19 +; SI-NEXT: s_lshl_b32 s20, s69, 24 +; SI-NEXT: s_or_b32 s39, s20, s19 ; SI-NEXT: s_and_b32 s19, s14, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s69, 24 -; SI-NEXT: s_or_b32 s9, s20, s19 -; SI-NEXT: s_and_b32 s19, s58, 0xff +; SI-NEXT: s_lshl_b32 s20, s70, 24 +; SI-NEXT: s_or_b32 s48, s20, s19 +; SI-NEXT: s_and_b32 s19, s29, 0xff ; SI-NEXT: s_lshl_b32 s20, s66, 8 ; SI-NEXT: s_or_b32 s85, s19, s20 ; SI-NEXT: s_and_b32 s19, s10, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s62, 24 +; SI-NEXT: s_lshl_b32 s20, s40, 24 ; SI-NEXT: s_or_b32 s49, s20, s19 ; SI-NEXT: s_and_b32 s19, s27, 0xff -; SI-NEXT: v_writelane_b32 v42, s9, 55 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s28, 24 -; SI-NEXT: v_readlane_b32 s9, v43, 33 +; SI-NEXT: v_readlane_b32 s7, v44, 35 ; SI-NEXT: s_or_b32 s50, s20, s19 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 32 +; SI-NEXT: s_and_b32 s19, s7, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s9, 24 -; SI-NEXT: v_readlane_b32 s9, v43, 31 +; SI-NEXT: s_lshl_b32 s20, s44, 24 ; SI-NEXT: s_or_b32 s51, s20, s19 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 30 -; SI-NEXT: s_lshl_b32 
s20, s9, 8 -; SI-NEXT: v_readlane_b32 s9, v43, 29 +; SI-NEXT: s_and_b32 s19, s23, 0xff +; SI-NEXT: s_lshl_b32 s20, s73, 8 ; SI-NEXT: s_or_b32 s86, s19, s20 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 28 +; SI-NEXT: s_and_b32 s19, s72, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s9, 24 -; SI-NEXT: v_readlane_b32 s9, v43, 27 +; SI-NEXT: s_lshl_b32 s20, s13, 24 ; SI-NEXT: s_or_b32 s52, s20, s19 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 26 +; SI-NEXT: s_and_b32 s19, s42, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s9, 24 -; SI-NEXT: v_readlane_b32 s9, v43, 25 +; SI-NEXT: s_lshl_b32 s20, s60, 24 ; SI-NEXT: s_or_b32 s53, s20, s19 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 24 +; SI-NEXT: s_and_b32 s19, s45, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s9, 24 -; SI-NEXT: v_readlane_b32 s9, v43, 23 +; SI-NEXT: s_lshl_b32 s20, s47, 24 ; SI-NEXT: s_or_b32 s54, s20, s19 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 22 -; SI-NEXT: s_lshl_b32 s20, s9, 8 -; SI-NEXT: v_readlane_b32 s9, v43, 21 +; SI-NEXT: s_and_b32 s19, s76, 0xff +; SI-NEXT: s_lshl_b32 s20, s74, 8 ; SI-NEXT: s_or_b32 s87, s19, s20 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 20 +; SI-NEXT: s_and_b32 s19, s61, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s9, 24 -; SI-NEXT: v_readlane_b32 s9, v43, 19 +; SI-NEXT: s_lshl_b32 s20, s56, 24 ; SI-NEXT: s_or_b32 s55, s20, s19 -; SI-NEXT: s_mov_b32 s58, s9 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 18 +; SI-NEXT: s_and_b32 s19, s57, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s9, 24 +; SI-NEXT: s_lshl_b32 s20, s63, 24 ; SI-NEXT: s_or_b32 s64, s20, s19 -; SI-NEXT: s_and_b32 s19, s78, 0xff +; SI-NEXT: s_and_b32 s19, s77, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s24, 
24 +; SI-NEXT: s_lshl_b32 s20, s58, 24 +; SI-NEXT: v_readlane_b32 s7, v44, 34 ; SI-NEXT: s_or_b32 s65, s20, s19 -; SI-NEXT: s_and_b32 s19, s4, 0xff -; SI-NEXT: s_lshl_b32 s20, s45, 8 +; SI-NEXT: s_and_b32 s19, s26, 0xff +; SI-NEXT: s_mov_b32 s42, s7 +; SI-NEXT: s_lshl_b32 s20, s7, 8 +; SI-NEXT: v_readlane_b32 s7, v44, 33 +; SI-NEXT: s_mov_b32 s94, s26 ; SI-NEXT: s_or_b32 s26, s19, s20 -; SI-NEXT: s_and_b32 s19, s13, 0xff +; SI-NEXT: s_mov_b32 s47, s7 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v44, 32 +; SI-NEXT: s_mov_b32 s92, s56 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s57, 24 +; SI-NEXT: s_mov_b32 s56, s7 +; SI-NEXT: s_lshl_b32 s20, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v44, 31 ; SI-NEXT: s_or_b32 s66, s20, s19 -; SI-NEXT: s_and_b32 s19, s21, 0xff +; SI-NEXT: s_mov_b32 s61, s7 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v44, 30 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s73, 24 +; SI-NEXT: s_mov_b32 s60, s7 +; SI-NEXT: s_lshl_b32 s20, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v44, 29 ; SI-NEXT: s_or_b32 s67, s20, s19 -; SI-NEXT: s_and_b32 s19, s42, 0xff -; SI-NEXT: v_readlane_b32 s88, v43, 17 +; SI-NEXT: s_mov_b32 s68, s7 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v44, 28 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s59, 24 -; SI-NEXT: s_or_b32 s68, s20, s19 -; SI-NEXT: s_and_b32 s19, s63, 0xff -; SI-NEXT: s_lshl_b32 s20, s88, 8 +; SI-NEXT: s_mov_b32 s59, s7 +; SI-NEXT: s_lshl_b32 s20, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v44, 27 +; SI-NEXT: s_or_b32 s45, s20, s19 +; SI-NEXT: s_mov_b32 s46, s7 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v44, 26 +; SI-NEXT: s_mov_b32 s69, s7 +; SI-NEXT: s_lshl_b32 s20, s7, 8 +; SI-NEXT: v_readlane_b32 s7, v44, 25 ; SI-NEXT: s_or_b32 s27, s19, s20 -; SI-NEXT: s_and_b32 s19, s40, 0xff +; SI-NEXT: s_mov_b32 s40, s7 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, 
v44, 24 +; SI-NEXT: s_mov_b32 s89, s76 +; SI-NEXT: s_mov_b32 s76, s58 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s43, 24 -; SI-NEXT: s_or_b32 s69, s20, s19 -; SI-NEXT: s_and_b32 s19, s61, 0xff -; SI-NEXT: s_mov_b32 s39, s57 -; SI-NEXT: s_mov_b32 s57, s7 +; SI-NEXT: s_mov_b32 s58, s7 +; SI-NEXT: s_lshl_b32 s20, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v44, 23 +; SI-NEXT: s_mov_b32 s93, s74 +; SI-NEXT: s_mov_b32 s88, s57 +; SI-NEXT: s_or_b32 s57, s20, s19 +; SI-NEXT: s_mov_b32 s74, s7 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v44, 22 +; SI-NEXT: s_mov_b32 s90, s63 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s75, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 16 +; SI-NEXT: s_mov_b32 s63, s7 +; SI-NEXT: s_lshl_b32 s20, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v44, 21 ; SI-NEXT: s_or_b32 s70, s20, s19 -; SI-NEXT: s_mov_b32 s10, s7 +; SI-NEXT: s_mov_b32 s71, s7 ; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v43, 15 +; SI-NEXT: v_readlane_b32 s7, v44, 20 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_mov_b32 s71, s7 +; SI-NEXT: s_mov_b32 s81, s7 ; SI-NEXT: s_lshl_b32 s20, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 14 -; SI-NEXT: s_or_b32 s62, s20, s19 -; SI-NEXT: s_mov_b32 s15, s7 -; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v43, 13 +; SI-NEXT: v_readlane_b32 s7, v44, 19 +; SI-NEXT: s_or_b32 s13, s20, s19 ; SI-NEXT: s_mov_b32 s41, s7 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v44, 18 +; SI-NEXT: s_mov_b32 s14, s7 ; SI-NEXT: s_lshl_b32 s20, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 12 +; SI-NEXT: v_readlane_b32 s7, v44, 17 ; SI-NEXT: s_or_b32 s29, s19, s20 -; SI-NEXT: s_mov_b32 s14, s7 +; SI-NEXT: s_mov_b32 s10, s7 ; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v43, 11 +; SI-NEXT: v_readlane_b32 s7, v44, 16 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_mov_b32 s9, s7 ; SI-NEXT: s_lshl_b32 s20, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 
10 +; SI-NEXT: v_readlane_b32 s7, v44, 15 ; SI-NEXT: s_or_b32 s80, s20, s19 -; SI-NEXT: s_mov_b32 s56, s7 +; SI-NEXT: s_mov_b32 s8, s7 ; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v43, 9 +; SI-NEXT: v_readlane_b32 s7, v44, 14 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_mov_b32 s81, s7 +; SI-NEXT: s_mov_b32 s15, s7 ; SI-NEXT: s_lshl_b32 s20, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 8 -; SI-NEXT: s_or_b32 s11, s20, s19 -; SI-NEXT: s_mov_b32 s82, s7 +; SI-NEXT: v_readlane_b32 s7, v44, 13 +; SI-NEXT: s_mov_b32 s72, s25 +; SI-NEXT: s_or_b32 s25, s20, s19 +; SI-NEXT: s_mov_b32 s83, s7 ; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v43, 7 +; SI-NEXT: v_readlane_b32 s7, v44, 12 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_mov_b32 s96, s7 +; SI-NEXT: s_mov_b32 s97, s7 ; SI-NEXT: s_lshl_b32 s20, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 6 -; SI-NEXT: s_mov_b32 s36, s63 -; SI-NEXT: s_mov_b32 s63, s93 -; SI-NEXT: s_mov_b32 s93, s61 -; SI-NEXT: s_mov_b32 s61, s91 -; SI-NEXT: s_mov_b32 s91, s75 -; SI-NEXT: s_mov_b32 s75, s92 -; SI-NEXT: s_or_b32 s92, s20, s19 -; SI-NEXT: s_mov_b32 s98, s7 -; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v43, 5 +; SI-NEXT: v_readlane_b32 s7, v44, 11 +; SI-NEXT: s_or_b32 s82, s20, s19 ; SI-NEXT: s_mov_b32 s44, s7 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v44, 10 ; SI-NEXT: s_lshl_b32 s20, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 4 -; SI-NEXT: s_mov_b32 s48, s13 -; SI-NEXT: s_mov_b32 s13, s94 -; SI-NEXT: s_mov_b32 s94, s21 +; SI-NEXT: v_readlane_b32 s12, v44, 9 +; SI-NEXT: v_readlane_b32 s22, v44, 8 +; SI-NEXT: s_mov_b32 s91, s77 +; SI-NEXT: s_mov_b32 s77, s21 ; SI-NEXT: s_or_b32 s21, s19, s20 -; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: s_mov_b32 s95, s4 +; SI-NEXT: s_and_b32 s19, s12, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s22, 24 -; SI-NEXT: v_readlane_b32 s4, v42, 58 -; SI-NEXT: s_mov_b32 s46, s45 -; SI-NEXT: s_mov_b32 
s34, s73 -; SI-NEXT: s_mov_b32 s73, s12 -; SI-NEXT: s_mov_b32 s37, s42 -; SI-NEXT: s_mov_b32 s38, s59 -; SI-NEXT: s_mov_b32 s59, s8 -; SI-NEXT: s_mov_b32 s30, s88 -; SI-NEXT: s_mov_b32 s88, s31 -; SI-NEXT: s_mov_b32 s78, s40 -; SI-NEXT: s_mov_b32 s31, s43 -; SI-NEXT: s_mov_b32 s12, s7 -; SI-NEXT: s_mov_b32 s7, s22 -; SI-NEXT: s_or_b32 s83, s20, s19 -; SI-NEXT: s_lshl_b32 s20, s4, 16 -; SI-NEXT: s_lshl_b32 s74, s5, 16 +; SI-NEXT: s_mov_b32 s73, s4 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s7, s12 +; SI-NEXT: s_mov_b32 s62, s22 +; SI-NEXT: s_or_b32 s28, s20, s19 +; SI-NEXT: s_lshl_b32 s20, s43, 16 +; SI-NEXT: s_lshl_b32 s23, s24, 16 ; SI-NEXT: s_lshl_b32 s22, s6, 16 ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_lshl_b32 s19, s17, 16 @@ -150200,50 +150245,48 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_lshl_b32 s17, vcc_lo, 16 ; SI-NEXT: s_lshl_b32 s6, vcc_hi, 16 ; SI-NEXT: s_lshl_b32 s99, s84, 16 -; SI-NEXT: s_lshl_b32 s8, s85, 16 -; SI-NEXT: s_lshl_b32 s97, s86, 16 -; SI-NEXT: s_lshl_b32 s28, s87, 16 +; SI-NEXT: s_lshl_b32 s98, s85, 16 +; SI-NEXT: s_lshl_b32 s12, s86, 16 +; SI-NEXT: s_lshl_b32 s96, s87, 16 ; SI-NEXT: s_lshl_b32 s87, s26, 16 -; SI-NEXT: v_readlane_b32 s26, v42, 56 +; SI-NEXT: v_readlane_b32 s26, v42, 0 ; SI-NEXT: s_lshl_b32 s86, s27, 16 -; SI-NEXT: v_readlane_b32 s27, v42, 57 -; SI-NEXT: v_readlane_b32 s35, v42, 61 +; SI-NEXT: v_readlane_b32 s27, v42, 1 ; SI-NEXT: s_lshl_b32 s85, s29, 16 -; SI-NEXT: v_readlane_b32 s29, v42, 60 -; SI-NEXT: v_readlane_b32 s24, v42, 59 -; SI-NEXT: v_readlane_b32 s90, v42, 62 +; SI-NEXT: v_readlane_b32 s29, v42, 3 +; SI-NEXT: v_readlane_b32 s24, v42, 2 ; SI-NEXT: s_lshl_b32 s84, s21, 16 -; SI-NEXT: s_mov_b32 s21, s47 +; SI-NEXT: s_mov_b32 s21, s5 ; SI-NEXT: s_cbranch_execnz .LBB89_3 ; SI-NEXT: .LBB89_2: ; %cmp.true -; SI-NEXT: s_add_i32 s4, s98, 3 +; SI-NEXT: s_add_i32 s4, s44, 3 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s5, s44, 8 -; SI-NEXT: 
s_add_i32 s6, s12, 3 +; SI-NEXT: s_lshl_b32 s5, s11, 8 +; SI-NEXT: s_add_i32 s6, s7, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: s_lshl_b32 s5, s7, 24 +; SI-NEXT: s_lshl_b32 s5, s62, 24 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s5, s56, 3 +; SI-NEXT: s_add_i32 s5, s8, 3 ; SI-NEXT: s_and_b32 s5, s5, 0xff -; SI-NEXT: s_lshl_b32 s6, s81, 8 -; SI-NEXT: s_add_i32 s16, s82, 3 +; SI-NEXT: s_lshl_b32 s6, s15, 8 +; SI-NEXT: s_add_i32 s16, s83, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: s_lshl_b32 s6, s96, 24 +; SI-NEXT: s_lshl_b32 s6, s97, 24 ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_addk_i32 s5, 0x300 ; SI-NEXT: s_or_b32 s6, s6, s16 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_add_i32 s6, s15, 3 +; SI-NEXT: s_add_i32 s6, s41, 3 ; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: s_lshl_b32 s16, s41, 8 -; SI-NEXT: s_add_i32 s17, s14, 3 +; SI-NEXT: s_lshl_b32 s16, s14, 8 +; SI-NEXT: s_add_i32 s17, s10, 3 ; SI-NEXT: s_or_b32 s6, s16, s6 ; SI-NEXT: s_and_b32 s17, s17, 0xff ; SI-NEXT: s_lshl_b32 s16, s9, 24 @@ -150252,162 +150295,156 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s6, s16, s6 -; SI-NEXT: s_add_i32 s16, s93, 3 +; SI-NEXT: s_add_i32 s16, s74, 3 ; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: s_lshl_b32 s17, s91, 8 -; SI-NEXT: s_add_i32 s18, s10, 3 +; SI-NEXT: s_lshl_b32 s17, s63, 8 +; SI-NEXT: s_add_i32 s18, s71, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: s_lshl_b32 s17, s71, 24 +; SI-NEXT: s_lshl_b32 s17, s81, 24 ; SI-NEXT: s_lshl_b32 s18, s18, 16 ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_and_b32 s16, s16, 0xffff 
; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s17, s36, 3 +; SI-NEXT: s_add_i32 s17, s46, 3 ; SI-NEXT: s_and_b32 s17, s17, 0xff -; SI-NEXT: s_lshl_b32 s18, s30, 8 -; SI-NEXT: s_add_i32 s19, s78, 3 +; SI-NEXT: s_lshl_b32 s18, s69, 8 +; SI-NEXT: s_add_i32 s19, s40, 3 ; SI-NEXT: s_or_b32 s17, s18, s17 ; SI-NEXT: s_and_b32 s19, s19, 0xff -; SI-NEXT: s_lshl_b32 s18, s31, 24 +; SI-NEXT: s_lshl_b32 s18, s58, 24 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_addk_i32 s17, 0x300 ; SI-NEXT: s_or_b32 s18, s18, s19 ; SI-NEXT: s_and_b32 s17, s17, 0xffff ; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_add_i32 s18, s94, 3 +; SI-NEXT: s_add_i32 s18, s61, 3 ; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: s_lshl_b32 s19, s34, 8 -; SI-NEXT: s_add_i32 s20, s37, 3 +; SI-NEXT: s_lshl_b32 s19, s60, 8 +; SI-NEXT: s_add_i32 s20, s68, 3 ; SI-NEXT: s_or_b32 s18, s19, s18 ; SI-NEXT: s_and_b32 s20, s20, 0xff -; SI-NEXT: s_lshl_b32 s19, s38, 24 +; SI-NEXT: s_lshl_b32 s19, s59, 24 ; SI-NEXT: s_lshl_b32 s20, s20, 16 ; SI-NEXT: s_addk_i32 s18, 0x300 ; SI-NEXT: s_or_b32 s19, s19, s20 ; SI-NEXT: s_and_b32 s18, s18, 0xffff ; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: s_add_i32 s19, s95, 3 +; SI-NEXT: s_add_i32 s19, s94, 3 ; SI-NEXT: s_and_b32 s19, s19, 0xff -; SI-NEXT: s_lshl_b32 s20, s46, 8 -; SI-NEXT: s_add_i32 s22, s48, 3 +; SI-NEXT: s_lshl_b32 s20, s42, 8 +; SI-NEXT: s_add_i32 s22, s47, 3 ; SI-NEXT: s_or_b32 s19, s20, s19 ; SI-NEXT: s_and_b32 s22, s22, 0xff -; SI-NEXT: s_lshl_b32 s20, s39, 24 +; SI-NEXT: s_lshl_b32 s20, s56, 24 ; SI-NEXT: s_lshl_b32 s22, s22, 16 ; SI-NEXT: s_addk_i32 s19, 0x300 ; SI-NEXT: s_or_b32 s20, s20, s22 ; SI-NEXT: s_and_b32 s19, s19, 0xffff ; SI-NEXT: s_or_b32 s19, s20, s19 -; SI-NEXT: s_add_i32 s20, s58, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 18 +; SI-NEXT: s_add_i32 s20, s88, 3 ; SI-NEXT: s_and_b32 s20, s20, 0xff -; SI-NEXT: s_lshl_b32 s22, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v42, 49 +; SI-NEXT: s_lshl_b32 s22, s90, 8 +; SI-NEXT: s_add_i32 s23, s91, 3 ; 
SI-NEXT: s_or_b32 s20, s22, s20 -; SI-NEXT: s_lshl_b32 s22, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v42, 48 -; SI-NEXT: s_add_i32 s23, s7, 3 ; SI-NEXT: s_and_b32 s23, s23, 0xff +; SI-NEXT: s_lshl_b32 s22, s76, 24 ; SI-NEXT: s_lshl_b32 s23, s23, 16 ; SI-NEXT: s_addk_i32 s20, 0x300 ; SI-NEXT: s_or_b32 s22, s22, s23 ; SI-NEXT: s_and_b32 s20, s20, 0xffff -; SI-NEXT: v_readlane_b32 s7, v43, 23 ; SI-NEXT: s_or_b32 s20, s22, s20 -; SI-NEXT: s_add_i32 s22, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 22 +; SI-NEXT: s_add_i32 s22, s89, 3 +; SI-NEXT: v_readlane_b32 s7, v43, 55 ; SI-NEXT: s_and_b32 s22, s22, 0xff -; SI-NEXT: s_lshl_b32 s23, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 20 -; SI-NEXT: s_or_b32 s22, s23, s22 -; SI-NEXT: s_lshl_b32 s23, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 21 +; SI-NEXT: s_lshl_b32 s23, s93, 8 ; SI-NEXT: s_add_i32 s60, s7, 3 +; SI-NEXT: s_or_b32 s22, s23, s22 ; SI-NEXT: s_and_b32 s60, s60, 0xff +; SI-NEXT: s_lshl_b32 s23, s92, 24 ; SI-NEXT: s_lshl_b32 s60, s60, 16 ; SI-NEXT: s_addk_i32 s22, 0x300 ; SI-NEXT: s_or_b32 s23, s23, s60 ; SI-NEXT: s_and_b32 s22, s22, 0xffff -; SI-NEXT: v_readlane_b32 s7, v43, 27 +; SI-NEXT: v_readlane_b32 s7, v44, 47 ; SI-NEXT: s_or_b32 s22, s23, s22 ; SI-NEXT: s_add_i32 s23, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 26 +; SI-NEXT: v_readlane_b32 s7, v44, 45 ; SI-NEXT: s_and_b32 s23, s23, 0xff ; SI-NEXT: s_lshl_b32 s60, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 24 +; SI-NEXT: v_readlane_b32 s7, v43, 54 ; SI-NEXT: s_or_b32 s23, s60, s23 ; SI-NEXT: s_lshl_b32 s60, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 25 +; SI-NEXT: v_readlane_b32 s7, v43, 53 ; SI-NEXT: s_add_i32 s61, s7, 3 ; SI-NEXT: s_and_b32 s61, s61, 0xff ; SI-NEXT: s_lshl_b32 s61, s61, 16 ; SI-NEXT: s_addk_i32 s23, 0x300 ; SI-NEXT: s_or_b32 s60, s60, s61 ; SI-NEXT: s_and_b32 s23, s23, 0xffff -; SI-NEXT: v_readlane_b32 s7, v43, 31 +; SI-NEXT: v_readlane_b32 s7, v44, 48 ; SI-NEXT: s_or_b32 s23, s60, s23 ; SI-NEXT: s_add_i32 s60, s7, 3 -; SI-NEXT: v_readlane_b32 s7, 
v43, 30 +; SI-NEXT: v_readlane_b32 s7, v44, 46 ; SI-NEXT: s_and_b32 s60, s60, 0xff ; SI-NEXT: s_lshl_b32 s61, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 28 +; SI-NEXT: v_readlane_b32 s7, v44, 44 ; SI-NEXT: s_or_b32 s60, s61, s60 ; SI-NEXT: s_lshl_b32 s61, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 29 +; SI-NEXT: v_readlane_b32 s7, v44, 43 ; SI-NEXT: s_add_i32 s62, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 47 +; SI-NEXT: v_readlane_b32 s7, v43, 52 ; SI-NEXT: s_and_b32 s62, s62, 0xff ; SI-NEXT: s_add_i32 s59, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 46 +; SI-NEXT: v_readlane_b32 s7, v43, 51 ; SI-NEXT: s_lshl_b32 s62, s62, 16 ; SI-NEXT: s_addk_i32 s60, 0x300 ; SI-NEXT: s_and_b32 s59, s59, 0xff ; SI-NEXT: s_lshl_b32 s58, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 32 +; SI-NEXT: v_readlane_b32 s7, v44, 42 ; SI-NEXT: s_or_b32 s61, s61, s62 ; SI-NEXT: s_and_b32 s60, s60, 0xffff ; SI-NEXT: s_or_b32 s58, s58, s59 ; SI-NEXT: s_lshl_b32 s59, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 33 +; SI-NEXT: v_readlane_b32 s7, v44, 35 ; SI-NEXT: s_or_b32 s60, s61, s60 ; SI-NEXT: s_add_i32 s61, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 45 +; SI-NEXT: v_readlane_b32 s7, v43, 50 ; SI-NEXT: s_add_i32 s57, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 44 +; SI-NEXT: v_readlane_b32 s7, v43, 49 ; SI-NEXT: s_lshl_b32 s56, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v42, 43 +; SI-NEXT: v_readlane_b32 s7, v43, 48 ; SI-NEXT: s_lshl_b32 s47, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v42, 42 +; SI-NEXT: v_readlane_b32 s7, v43, 47 ; SI-NEXT: s_add_i32 s46, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 41 +; SI-NEXT: v_readlane_b32 s7, v43, 46 ; SI-NEXT: s_add_i32 s45, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 38 +; SI-NEXT: v_readlane_b32 s7, v43, 43 ; SI-NEXT: s_lshl_b32 s42, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v42, 35 +; SI-NEXT: v_readlane_b32 s7, v43, 40 ; SI-NEXT: s_lshl_b32 s15, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v42, 34 +; SI-NEXT: v_readlane_b32 s7, v43, 39 ; SI-NEXT: s_and_b32 s45, s45, 0xff ; SI-NEXT: 
s_add_i32 s14, s7, 3 ; SI-NEXT: s_or_b32 s42, s42, s45 ; SI-NEXT: s_and_b32 s14, s14, 0xff ; SI-NEXT: s_lshl_b32 s14, s14, 16 ; SI-NEXT: s_addk_i32 s42, 0x300 -; SI-NEXT: v_readlane_b32 s7, v42, 40 +; SI-NEXT: v_readlane_b32 s7, v43, 45 ; SI-NEXT: s_and_b32 s57, s57, 0xff ; SI-NEXT: s_or_b32 s14, s15, s14 ; SI-NEXT: s_and_b32 s15, s42, 0xffff ; SI-NEXT: s_add_i32 s44, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 39 +; SI-NEXT: v_readlane_b32 s7, v43, 44 ; SI-NEXT: s_or_b32 s56, s56, s57 ; SI-NEXT: s_or_b32 s57, s14, s15 ; SI-NEXT: s_and_b32 s14, s44, 0xff ; SI-NEXT: s_lshl_b32 s15, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v42, 37 +; SI-NEXT: v_readlane_b32 s7, v43, 42 ; SI-NEXT: s_or_b32 s14, s15, s14 ; SI-NEXT: s_lshl_b32 s15, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v42, 36 +; SI-NEXT: v_readlane_b32 s7, v43, 41 ; SI-NEXT: s_add_i32 s40, s7, 3 ; SI-NEXT: s_and_b32 s61, s61, 0xff ; SI-NEXT: s_and_b32 s40, s40, 0xff @@ -150422,15 +150459,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s58, s59, s58 ; SI-NEXT: s_or_b32 s59, s15, s14 ; SI-NEXT: s_add_i32 s14, s6, 0x3000000 -; SI-NEXT: v_readlane_b32 s6, v42, 31 +; SI-NEXT: v_readlane_b32 s6, v43, 36 ; SI-NEXT: s_add_i32 s11, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 28 +; SI-NEXT: v_readlane_b32 s7, v43, 33 ; SI-NEXT: s_and_b32 s6, s11, 0xff ; SI-NEXT: s_lshl_b32 s8, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v42, 25 +; SI-NEXT: v_readlane_b32 s7, v43, 30 ; SI-NEXT: s_or_b32 s6, s8, s6 ; SI-NEXT: s_lshl_b32 s8, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v42, 24 +; SI-NEXT: v_readlane_b32 s7, v43, 29 ; SI-NEXT: s_add_i32 s24, s7, 3 ; SI-NEXT: s_and_b32 s11, s24, 0xff ; SI-NEXT: s_addk_i32 s6, 0x300 @@ -150438,47 +150475,47 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s8, s8, s11 ; SI-NEXT: s_or_b32 s8, s8, s6 -; SI-NEXT: v_readlane_b32 s6, v42, 32 +; SI-NEXT: v_readlane_b32 s6, v43, 37 ; 
SI-NEXT: s_add_i32 s12, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 26 +; SI-NEXT: v_readlane_b32 s7, v43, 31 ; SI-NEXT: s_and_b32 s6, s12, 0xff ; SI-NEXT: s_lshl_b32 s11, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v42, 16 +; SI-NEXT: v_readlane_b32 s7, v43, 21 ; SI-NEXT: s_or_b32 s6, s11, s6 ; SI-NEXT: s_lshl_b32 s11, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v42, 18 +; SI-NEXT: v_readlane_b32 s7, v43, 23 ; SI-NEXT: s_add_i32 s12, s7, 3 ; SI-NEXT: s_and_b32 s12, s12, 0xff ; SI-NEXT: s_addk_i32 s6, 0x300 ; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: v_readlane_b32 s7, v42, 33 +; SI-NEXT: v_readlane_b32 s7, v43, 38 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s11, s11, s12 ; SI-NEXT: s_add_i32 s13, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 30 +; SI-NEXT: v_readlane_b32 s7, v43, 35 ; SI-NEXT: s_or_b32 s6, s11, s6 ; SI-NEXT: s_and_b32 s11, s13, 0xff ; SI-NEXT: s_lshl_b32 s10, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v42, 22 +; SI-NEXT: v_readlane_b32 s7, v43, 27 ; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: s_lshl_b32 s11, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v42, 23 +; SI-NEXT: v_readlane_b32 s7, v43, 28 ; SI-NEXT: s_add_i32 s25, s7, 3 ; SI-NEXT: s_and_b32 s12, s25, 0xff ; SI-NEXT: s_addk_i32 s10, 0x300 ; SI-NEXT: s_lshl_b32 s12, s12, 16 ; SI-NEXT: s_and_b32 s10, s10, 0xffff ; SI-NEXT: s_or_b32 s11, s11, s12 -; SI-NEXT: v_readlane_b32 s7, v42, 29 +; SI-NEXT: v_readlane_b32 s7, v43, 34 ; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: s_add_i32 s9, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 27 -; SI-NEXT: v_readlane_b32 s11, v42, 20 +; SI-NEXT: v_readlane_b32 s7, v43, 32 +; SI-NEXT: v_readlane_b32 s11, v43, 25 ; SI-NEXT: s_and_b32 s9, s9, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: v_readlane_b32 s9, v42, 21 +; SI-NEXT: v_readlane_b32 s9, v43, 26 ; SI-NEXT: s_and_b32 s11, s11, 0xff ; SI-NEXT: s_addk_i32 s7, 0x300 ; SI-NEXT: s_lshl_b32 s9, s9, 24 @@ -150486,15 +150523,15 @@ define inreg <64 x bfloat> 
@bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s7, s7, 0xffff ; SI-NEXT: s_or_b32 s9, s9, s11 ; SI-NEXT: s_or_b32 s7, s9, s7 -; SI-NEXT: v_readlane_b32 s9, v42, 19 +; SI-NEXT: v_readlane_b32 s9, v43, 24 ; SI-NEXT: s_add_i32 s21, s9, 3 -; SI-NEXT: v_readlane_b32 s11, v42, 17 -; SI-NEXT: v_readlane_b32 s12, v42, 14 +; SI-NEXT: v_readlane_b32 s11, v43, 22 +; SI-NEXT: v_readlane_b32 s12, v43, 19 ; SI-NEXT: s_and_b32 s9, s21, 0xff ; SI-NEXT: s_lshl_b32 s11, s11, 8 ; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_or_b32 s9, s11, s9 -; SI-NEXT: v_readlane_b32 s11, v42, 15 +; SI-NEXT: v_readlane_b32 s11, v43, 20 ; SI-NEXT: s_and_b32 s12, s12, 0xff ; SI-NEXT: s_addk_i32 s9, 0x300 ; SI-NEXT: s_lshl_b32 s11, s11, 24 @@ -150502,15 +150539,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s9, s9, 0xffff ; SI-NEXT: s_or_b32 s11, s11, s12 ; SI-NEXT: s_or_b32 s9, s11, s9 -; SI-NEXT: v_readlane_b32 s11, v42, 13 +; SI-NEXT: v_readlane_b32 s11, v43, 18 ; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: v_readlane_b32 s12, v42, 12 -; SI-NEXT: v_readlane_b32 s13, v42, 10 +; SI-NEXT: v_readlane_b32 s12, v43, 17 +; SI-NEXT: v_readlane_b32 s13, v43, 15 ; SI-NEXT: s_and_b32 s11, s11, 0xff ; SI-NEXT: s_lshl_b32 s12, s12, 8 ; SI-NEXT: s_add_i32 s13, s13, 3 ; SI-NEXT: s_or_b32 s11, s12, s11 -; SI-NEXT: v_readlane_b32 s12, v42, 11 +; SI-NEXT: v_readlane_b32 s12, v43, 16 ; SI-NEXT: s_and_b32 s13, s13, 0xff ; SI-NEXT: s_addk_i32 s11, 0x300 ; SI-NEXT: s_lshl_b32 s12, s12, 24 @@ -150518,16 +150555,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s11, s11, 0xffff ; SI-NEXT: s_or_b32 s12, s12, s13 ; SI-NEXT: s_or_b32 s11, s12, s11 -; SI-NEXT: v_readlane_b32 s12, v42, 9 +; SI-NEXT: v_readlane_b32 s12, v43, 14 ; SI-NEXT: s_add_i32 s15, s16, 0x3000000 ; SI-NEXT: s_add_i32 s12, s12, 3 -; SI-NEXT: v_readlane_b32 s13, v42, 8 -; SI-NEXT: v_readlane_b32 s16, v42, 6 +; 
SI-NEXT: v_readlane_b32 s13, v43, 13 +; SI-NEXT: v_readlane_b32 s16, v43, 11 ; SI-NEXT: s_and_b32 s12, s12, 0xff ; SI-NEXT: s_lshl_b32 s13, s13, 8 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_or_b32 s12, s13, s12 -; SI-NEXT: v_readlane_b32 s13, v42, 7 +; SI-NEXT: v_readlane_b32 s13, v43, 12 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_addk_i32 s12, 0x300 ; SI-NEXT: s_lshl_b32 s13, s13, 24 @@ -150535,16 +150572,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s12, s12, 0xffff ; SI-NEXT: s_or_b32 s13, s13, s16 ; SI-NEXT: s_or_b32 s12, s13, s12 -; SI-NEXT: v_readlane_b32 s13, v42, 5 +; SI-NEXT: v_readlane_b32 s13, v43, 10 ; SI-NEXT: s_add_i32 s40, s17, 0x3000000 ; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: v_readlane_b32 s16, v42, 4 -; SI-NEXT: v_readlane_b32 s17, v42, 2 +; SI-NEXT: v_readlane_b32 s16, v43, 9 +; SI-NEXT: v_readlane_b32 s17, v43, 7 ; SI-NEXT: s_and_b32 s13, s13, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 8 ; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s13, s16, s13 -; SI-NEXT: v_readlane_b32 s16, v42, 3 +; SI-NEXT: v_readlane_b32 s16, v43, 8 ; SI-NEXT: s_and_b32 s17, s17, 0xff ; SI-NEXT: s_addk_i32 s13, 0x300 ; SI-NEXT: s_lshl_b32 s16, s16, 24 @@ -150552,16 +150589,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s13, s13, 0xffff ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: s_or_b32 s13, s16, s13 -; SI-NEXT: v_readlane_b32 s16, v42, 1 +; SI-NEXT: v_readlane_b32 s16, v43, 6 ; SI-NEXT: s_add_i32 s41, s18, 0x3000000 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v42, 0 -; SI-NEXT: v_readlane_b32 s18, v43, 62 +; SI-NEXT: v_readlane_b32 s17, v43, 5 +; SI-NEXT: v_readlane_b32 s18, v43, 3 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v43, 63 +; SI-NEXT: v_readlane_b32 s17, v43, 4 ; SI-NEXT: 
s_and_b32 s18, s18, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 @@ -150570,16 +150607,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s17, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v43, 61 +; SI-NEXT: v_readlane_b32 s16, v43, 2 ; SI-NEXT: s_add_i32 s42, s19, 0x3000000 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s18, v43, 60 -; SI-NEXT: v_readlane_b32 s19, v43, 58 +; SI-NEXT: v_readlane_b32 s18, v43, 1 +; SI-NEXT: v_readlane_b32 s19, v44, 63 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s18, s18, 8 ; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_or_b32 s16, s18, s16 -; SI-NEXT: v_readlane_b32 s18, v43, 59 +; SI-NEXT: v_readlane_b32 s18, v43, 0 ; SI-NEXT: s_and_b32 s19, s19, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s18, s18, 24 @@ -150587,16 +150624,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: s_or_b32 s18, s18, s19 ; SI-NEXT: s_or_b32 s16, s18, s16 -; SI-NEXT: v_readlane_b32 s18, v43, 57 +; SI-NEXT: v_readlane_b32 s18, v44, 62 ; SI-NEXT: s_add_i32 s43, s20, 0x3000000 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_readlane_b32 s19, v43, 56 -; SI-NEXT: v_readlane_b32 s20, v43, 54 +; SI-NEXT: v_readlane_b32 s19, v44, 61 +; SI-NEXT: v_readlane_b32 s20, v44, 59 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 8 ; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: v_readlane_b32 s19, v43, 55 +; SI-NEXT: v_readlane_b32 s19, v44, 60 ; SI-NEXT: s_and_b32 s20, s20, 0xff ; SI-NEXT: s_addk_i32 s18, 0x300 ; SI-NEXT: s_lshl_b32 s19, s19, 24 @@ -150604,15 +150641,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s18, s18, 0xffff ; SI-NEXT: s_or_b32 s19, s19, s20 ; SI-NEXT: s_or_b32 s18, 
s19, s18 -; SI-NEXT: v_readlane_b32 s19, v43, 53 +; SI-NEXT: v_readlane_b32 s19, v44, 58 ; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: v_readlane_b32 s20, v43, 52 -; SI-NEXT: v_readlane_b32 s21, v43, 50 +; SI-NEXT: v_readlane_b32 s20, v44, 57 +; SI-NEXT: v_readlane_b32 s21, v44, 55 ; SI-NEXT: s_and_b32 s19, s19, 0xff ; SI-NEXT: s_lshl_b32 s20, s20, 8 ; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_or_b32 s19, s20, s19 -; SI-NEXT: v_readlane_b32 s20, v43, 51 +; SI-NEXT: v_readlane_b32 s20, v44, 56 ; SI-NEXT: s_and_b32 s21, s21, 0xff ; SI-NEXT: s_addk_i32 s19, 0x300 ; SI-NEXT: s_lshl_b32 s20, s20, 24 @@ -150620,16 +150657,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s19, s19, 0xffff ; SI-NEXT: s_or_b32 s20, s20, s21 ; SI-NEXT: s_or_b32 s19, s20, s19 -; SI-NEXT: v_readlane_b32 s20, v43, 49 +; SI-NEXT: v_readlane_b32 s20, v44, 54 ; SI-NEXT: s_add_i32 s44, s22, 0x3000000 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_readlane_b32 s21, v43, 48 -; SI-NEXT: v_readlane_b32 s22, v43, 46 +; SI-NEXT: v_readlane_b32 s21, v44, 53 +; SI-NEXT: v_readlane_b32 s22, v44, 51 ; SI-NEXT: s_and_b32 s20, s20, 0xff ; SI-NEXT: s_lshl_b32 s21, s21, 8 ; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_or_b32 s20, s21, s20 -; SI-NEXT: v_readlane_b32 s21, v43, 47 +; SI-NEXT: v_readlane_b32 s21, v44, 52 ; SI-NEXT: s_and_b32 s22, s22, 0xff ; SI-NEXT: s_addk_i32 s20, 0x300 ; SI-NEXT: s_lshl_b32 s21, s21, 24 @@ -150638,16 +150675,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s21, s21, s22 ; SI-NEXT: s_or_b32 s20, s21, s20 ; SI-NEXT: s_add_i32 s21, s20, 0x3000000 -; SI-NEXT: v_readlane_b32 s20, v43, 43 +; SI-NEXT: v_readlane_b32 s20, v44, 1 ; SI-NEXT: s_add_i32 s45, s23, 0x3000000 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_readlane_b32 s22, v43, 42 -; SI-NEXT: v_readlane_b32 s23, v43, 44 +; SI-NEXT: v_readlane_b32 s22, v44, 0 +; SI-NEXT: v_readlane_b32 s23, v44, 49 ; SI-NEXT: 
s_and_b32 s20, s20, 0xff ; SI-NEXT: s_lshl_b32 s22, s22, 8 ; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_or_b32 s20, s22, s20 -; SI-NEXT: v_readlane_b32 s22, v43, 45 +; SI-NEXT: v_readlane_b32 s22, v44, 50 ; SI-NEXT: s_and_b32 s23, s23, 0xff ; SI-NEXT: s_addk_i32 s20, 0x300 ; SI-NEXT: s_lshl_b32 s22, s22, 24 @@ -150656,15 +150693,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s22, s22, s23 ; SI-NEXT: s_or_b32 s20, s22, s20 ; SI-NEXT: s_add_i32 s22, s20, 0x3000000 -; SI-NEXT: v_readlane_b32 s20, v43, 41 +; SI-NEXT: v_readlane_b32 s20, v44, 41 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_readlane_b32 s23, v43, 40 -; SI-NEXT: v_readlane_b32 s24, v43, 38 +; SI-NEXT: v_readlane_b32 s23, v44, 40 +; SI-NEXT: v_readlane_b32 s24, v44, 3 ; SI-NEXT: s_and_b32 s20, s20, 0xff ; SI-NEXT: s_lshl_b32 s23, s23, 8 ; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_or_b32 s20, s23, s20 -; SI-NEXT: v_readlane_b32 s23, v43, 39 +; SI-NEXT: v_readlane_b32 s23, v44, 2 ; SI-NEXT: s_and_b32 s24, s24, 0xff ; SI-NEXT: s_addk_i32 s20, 0x300 ; SI-NEXT: s_lshl_b32 s23, s23, 24 @@ -150673,134 +150710,136 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s23, s23, s24 ; SI-NEXT: s_or_b32 s20, s23, s20 ; SI-NEXT: s_add_i32 s23, s20, 0x3000000 -; SI-NEXT: v_readlane_b32 s20, v43, 37 +; SI-NEXT: v_readlane_b32 s20, v44, 39 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_readlane_b32 s24, v43, 36 -; SI-NEXT: v_readlane_b32 s25, v43, 34 +; SI-NEXT: v_readlane_b32 s24, v44, 38 +; SI-NEXT: v_readlane_b32 s25, v44, 36 ; SI-NEXT: s_and_b32 s20, s20, 0xff ; SI-NEXT: s_lshl_b32 s24, s24, 8 ; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: s_or_b32 s20, s24, s20 -; SI-NEXT: v_readlane_b32 s24, v43, 35 +; SI-NEXT: v_readlane_b32 s24, v44, 37 ; SI-NEXT: s_and_b32 s25, s25, 0xff ; SI-NEXT: s_addk_i32 s20, 0x300 ; SI-NEXT: s_lshl_b32 s24, s24, 24 ; SI-NEXT: s_lshl_b32 s25, s25, 16 ; SI-NEXT: s_and_b32 s20, s20, 
0xffff ; SI-NEXT: s_or_b32 s24, s24, s25 -; SI-NEXT: s_and_b32 s46, s46, 0xff +; SI-NEXT: s_add_i32 s16, s16, 0x3000000 +; SI-NEXT: s_add_i32 s18, s18, 0x3000000 ; SI-NEXT: s_or_b32 s20, s24, s20 -; SI-NEXT: v_readlane_b32 s24, v43, 3 -; SI-NEXT: s_lshl_b32 s46, s46, 16 -; SI-NEXT: s_addk_i32 s56, 0x300 +; SI-NEXT: v_readlane_b32 s24, v44, 7 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: v_readlane_b32 s25, v43, 2 -; SI-NEXT: v_readlane_b32 s26, v43, 1 -; SI-NEXT: s_or_b32 s46, s47, s46 -; SI-NEXT: s_and_b32 s47, s56, 0xffff -; SI-NEXT: s_add_i32 s7, s7, 0x3000000 -; SI-NEXT: s_add_i32 s9, s9, 0x3000000 +; SI-NEXT: v_readlane_b32 s25, v44, 6 +; SI-NEXT: v_readlane_b32 s26, v44, 5 +; SI-NEXT: s_and_b32 s79, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s77, s18, 16 +; SI-NEXT: s_and_b32 s18, s16, 0xffff0000 ; SI-NEXT: s_and_b32 s24, s24, 0xff ; SI-NEXT: s_lshl_b32 s25, s25, 8 ; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_or_b32 s56, s46, s47 -; SI-NEXT: s_add_i32 s47, s58, 0x3000000 -; SI-NEXT: s_add_i32 s58, s59, 0x3000000 -; SI-NEXT: s_add_i32 s10, s10, 0x3000000 +; SI-NEXT: v_writelane_b32 v43, s18, 56 +; SI-NEXT: s_and_b32 s18, s17, 0xffff0000 +; SI-NEXT: s_and_b32 s46, s46, 0xff +; SI-NEXT: s_add_i32 s13, s13, 0x3000000 ; SI-NEXT: s_or_b32 s24, s25, s24 -; SI-NEXT: v_readlane_b32 s25, v43, 0 +; SI-NEXT: v_readlane_b32 s25, v44, 4 ; SI-NEXT: s_and_b32 s26, s26, 0xff -; SI-NEXT: s_and_b32 s73, s9, 0xffff0000 -; SI-NEXT: s_lshl_b32 s59, s9, 16 -; SI-NEXT: s_and_b32 s9, s7, 0xffff0000 -; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: v_writelane_b32 v43, s18, 57 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s46, s46, 16 +; SI-NEXT: s_addk_i32 s56, 0x300 +; SI-NEXT: s_add_i32 s12, s12, 0x3000000 +; SI-NEXT: s_add_i32 s19, s19, 0x3000000 ; SI-NEXT: s_addk_i32 s24, 0x300 ; SI-NEXT: s_lshl_b32 s25, s25, 24 ; SI-NEXT: s_lshl_b32 s26, s26, 16 -; SI-NEXT: s_and_b32 s63, s17, 0xffff0000 -; SI-NEXT: s_lshl_b32 s79, s17, 16 -; SI-NEXT: v_writelane_b32 v42, s9, 50 -; 
SI-NEXT: s_lshl_b32 s17, s7, 16 -; SI-NEXT: s_lshl_b32 s7, s10, 16 -; SI-NEXT: s_add_i32 s8, s8, 0x3000000 +; SI-NEXT: v_writelane_b32 v43, s17, 58 +; SI-NEXT: s_and_b32 s17, s13, 0xffff0000 +; SI-NEXT: s_or_b32 s46, s47, s46 +; SI-NEXT: s_and_b32 s47, s56, 0xffff ; SI-NEXT: s_and_b32 s24, s24, 0xffff ; SI-NEXT: s_or_b32 s25, s25, s26 -; SI-NEXT: v_writelane_b32 v42, s7, 51 -; SI-NEXT: s_and_b32 s7, s6, 0xffff0000 +; SI-NEXT: s_and_b32 s72, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s29, s23, 16 +; SI-NEXT: s_and_b32 s73, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s23, s22, 16 +; SI-NEXT: s_and_b32 s78, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s22, s19, 16 +; SI-NEXT: v_writelane_b32 v43, s17, 59 +; SI-NEXT: s_lshl_b32 s19, s13, 16 +; SI-NEXT: s_and_b32 s13, s12, 0xffff0000 +; SI-NEXT: s_or_b32 s56, s46, s47 +; SI-NEXT: s_add_i32 s11, s11, 0x3000000 ; SI-NEXT: s_or_b32 s24, s25, s24 -; SI-NEXT: v_writelane_b32 v42, s7, 52 -; SI-NEXT: s_and_b32 s7, s8, 0xffff0000 +; SI-NEXT: v_writelane_b32 v43, s13, 60 +; SI-NEXT: s_lshl_b32 s12, s12, 16 ; SI-NEXT: s_add_i32 s4, s4, 0x3000000 ; SI-NEXT: s_add_i32 s5, s5, 0x3000000 ; SI-NEXT: s_add_i32 s46, s60, 0x3000000 +; SI-NEXT: s_add_i32 s47, s58, 0x3000000 ; SI-NEXT: s_add_i32 s56, s56, 0x3000000 ; SI-NEXT: s_add_i32 s57, s57, 0x3000000 -; SI-NEXT: s_add_i32 s11, s11, 0x3000000 -; SI-NEXT: s_add_i32 s12, s12, 0x3000000 -; SI-NEXT: s_add_i32 s13, s13, 0x3000000 -; SI-NEXT: s_add_i32 s16, s16, 0x3000000 -; SI-NEXT: s_add_i32 s18, s18, 0x3000000 -; SI-NEXT: s_add_i32 s19, s19, 0x3000000 +; SI-NEXT: s_add_i32 s58, s59, 0x3000000 +; SI-NEXT: s_add_i32 s8, s8, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_add_i32 s10, s10, 0x3000000 +; SI-NEXT: s_add_i32 s7, s7, 0x3000000 +; SI-NEXT: s_add_i32 s9, s9, 0x3000000 ; SI-NEXT: s_add_i32 s20, s20, 0x3000000 ; SI-NEXT: s_add_i32 s24, s24, 0x3000000 -; SI-NEXT: v_writelane_b32 v42, s7, 53 -; SI-NEXT: s_lshl_b32 s7, s8, 16 +; SI-NEXT: v_writelane_b32 v43, s12, 61 +; SI-NEXT: 
s_and_b32 s12, s11, 0xffff0000 ; SI-NEXT: s_and_b32 s27, s24, 0xffff0000 ; SI-NEXT: s_lshl_b32 s26, s24, 16 ; SI-NEXT: s_and_b32 s24, s20, 0xffff0000 ; SI-NEXT: s_lshl_b32 s20, s20, 16 -; SI-NEXT: s_and_b32 s35, s23, 0xffff0000 -; SI-NEXT: s_lshl_b32 s29, s23, 16 -; SI-NEXT: s_and_b32 s90, s22, 0xffff0000 -; SI-NEXT: s_lshl_b32 s74, s22, 16 -; SI-NEXT: s_and_b32 s25, s21, 0xffff0000 +; SI-NEXT: s_and_b32 s75, s21, 0xffff0000 ; SI-NEXT: s_lshl_b32 s21, s21, 16 -; SI-NEXT: s_and_b32 s75, s19, 0xffff0000 -; SI-NEXT: s_lshl_b32 s22, s19, 16 -; SI-NEXT: s_and_b32 s61, s18, 0xffff0000 -; SI-NEXT: s_lshl_b32 s76, s18, 16 -; SI-NEXT: s_and_b32 s77, s16, 0xffff0000 ; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_and_b32 s89, s13, 0xffff0000 -; SI-NEXT: s_lshl_b32 s19, s13, 16 -; SI-NEXT: s_and_b32 s13, s12, 0xffff0000 -; SI-NEXT: s_lshl_b32 s88, s12, 16 -; SI-NEXT: s_and_b32 s60, s11, 0xffff0000 +; SI-NEXT: v_writelane_b32 v43, s12, 62 ; SI-NEXT: s_lshl_b32 s18, s11, 16 -; SI-NEXT: s_and_b32 s23, s10, 0xffff0000 +; SI-NEXT: s_and_b32 s95, s9, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_and_b32 s30, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s17, s7, 16 +; SI-NEXT: s_and_b32 s34, s10, 0xffff0000 +; SI-NEXT: s_lshl_b32 s31, s10, 16 +; SI-NEXT: s_and_b32 s35, s6, 0xffff0000 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: v_writelane_b32 v42, s7, 54 -; SI-NEXT: s_and_b32 s72, s58, 0xffff0000 +; SI-NEXT: s_and_b32 s37, s8, 0xffff0000 +; SI-NEXT: s_lshl_b32 s36, s8, 16 +; SI-NEXT: s_and_b32 s38, s58, 0xffff0000 ; SI-NEXT: s_lshl_b32 s99, s58, 16 -; SI-NEXT: s_and_b32 s7, s57, 0xffff0000 -; SI-NEXT: s_lshl_b32 s57, s57, 16 +; SI-NEXT: s_and_b32 s48, s57, 0xffff0000 +; SI-NEXT: s_lshl_b32 s39, s57, 16 ; SI-NEXT: s_and_b32 s49, s56, 0xffff0000 -; SI-NEXT: s_lshl_b32 s8, s56, 16 +; SI-NEXT: s_lshl_b32 s98, s56, 16 ; SI-NEXT: s_and_b32 s51, s47, 0xffff0000 ; SI-NEXT: s_lshl_b32 s50, s47, 16 ; SI-NEXT: s_and_b32 s52, s46, 0xffff0000 -; SI-NEXT: s_lshl_b32 s97, s46, 16 +; 
SI-NEXT: s_lshl_b32 s12, s46, 16 ; SI-NEXT: s_and_b32 s54, s45, 0xffff0000 ; SI-NEXT: s_lshl_b32 s53, s45, 16 ; SI-NEXT: s_and_b32 s55, s44, 0xffff0000 -; SI-NEXT: s_lshl_b32 s28, s44, 16 +; SI-NEXT: s_lshl_b32 s96, s44, 16 ; SI-NEXT: s_and_b32 s65, s43, 0xffff0000 ; SI-NEXT: s_lshl_b32 s64, s43, 16 ; SI-NEXT: s_and_b32 s66, s42, 0xffff0000 ; SI-NEXT: s_lshl_b32 s87, s42, 16 -; SI-NEXT: s_and_b32 s68, s41, 0xffff0000 +; SI-NEXT: s_and_b32 s45, s41, 0xffff0000 ; SI-NEXT: s_lshl_b32 s67, s41, 16 -; SI-NEXT: s_and_b32 s69, s40, 0xffff0000 +; SI-NEXT: s_and_b32 s57, s40, 0xffff0000 ; SI-NEXT: s_lshl_b32 s86, s40, 16 -; SI-NEXT: s_and_b32 s62, s15, 0xffff0000 +; SI-NEXT: s_and_b32 s13, s15, 0xffff0000 ; SI-NEXT: s_lshl_b32 s70, s15, 16 ; SI-NEXT: s_and_b32 s80, s14, 0xffff0000 ; SI-NEXT: s_lshl_b32 s85, s14, 16 -; SI-NEXT: s_and_b32 s92, s5, 0xffff0000 -; SI-NEXT: s_lshl_b32 s11, s5, 16 -; SI-NEXT: s_and_b32 s83, s4, 0xffff0000 +; SI-NEXT: s_and_b32 s82, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s25, s5, 16 +; SI-NEXT: s_and_b32 s28, s4, 0xffff0000 ; SI-NEXT: s_lshl_b32 s84, s4, 16 -; SI-NEXT: v_writelane_b32 v42, s7, 55 +; SI-NEXT: v_writelane_b32 v43, s9, 63 ; SI-NEXT: .LBB89_3: ; %end ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s27 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -150815,134 +150854,136 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s72 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s29 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s90 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s74 +; 
SI-NEXT: v_mul_f32_e64 v2, 1.0, s23 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s75 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s21 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s75 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s78 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s22 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s79 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s76 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s77 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: v_readlane_b32 s4, v43, 56 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: v_readlane_b32 s4, v43, 57 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s63 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_readlane_b32 s4, v43, 58 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s79 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: v_readlane_b32 s4, v43, 59 ; SI-NEXT: buffer_store_dword v1, 
v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s89 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: v_readlane_b32 s4, v43, 60 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s13 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_readlane_b32 s4, v43, 61 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s88 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: v_readlane_b32 s4, v43, 62 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s60 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95 +; SI-NEXT: v_readlane_b32 s4, v43, 63 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s59 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: v_readlane_b32 s4, v42, 50 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s30 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s17 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s23 -; SI-NEXT: 
v_readlane_b32 s4, v42, 51 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s34 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s31 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: v_readlane_b32 s4, v42, 52 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s6 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: v_readlane_b32 s4, v42, 53 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 -; SI-NEXT: v_readlane_b32 s4, v42, 54 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s37 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s36 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s72 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s38 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s99 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: v_readlane_b32 s4, v42, 55 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s48 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s57 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s39 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s8 +; 
SI-NEXT: v_mul_f32_e64 v2, 1.0, s98 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -150956,7 +150997,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s52 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s97 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s12 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -150970,7 +151011,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s55 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s96 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -150989,21 +151030,21 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s68 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s45 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s67 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s57 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s86 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s62 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, 
s13 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s70 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 @@ -151017,14 +151058,14 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s92 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s82 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s11 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s25 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s83 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s28 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s84 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 @@ -151071,109 +151112,118 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB89_4: -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; kill: killed $sgpr8 -; SI-NEXT: s_mov_b32 s7, s6 -; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr8 -; SI-NEXT: v_readlane_b32 s58, v43, 19 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: s_mov_b32 s95, s47 -; SI-NEXT: s_mov_b32 s94, s21 -; SI-NEXT: s_mov_b32 s93, s61 -; SI-NEXT: s_mov_b32 s34, s73 -; SI-NEXT: s_mov_b32 s91, s75 -; SI-NEXT: v_readlane_b32 s56, v43, 10 -; SI-NEXT: s_mov_b32 s36, 
s63 -; SI-NEXT: s_mov_b32 s38, s59 -; SI-NEXT: s_mov_b32 s37, s42 -; SI-NEXT: v_readlane_b32 s30, v43, 17 -; SI-NEXT: v_readlane_b32 s98, v43, 6 -; SI-NEXT: s_mov_b32 s46, s45 -; SI-NEXT: s_mov_b32 s31, s43 -; SI-NEXT: s_mov_b32 s78, s40 -; SI-NEXT: v_readlane_b32 s15, v43, 14 -; SI-NEXT: s_mov_b32 s39, s57 -; SI-NEXT: s_mov_b32 s48, s13 -; SI-NEXT: v_readlane_b32 s41, v43, 13 -; SI-NEXT: v_readlane_b32 s44, v43, 5 -; SI-NEXT: v_readlane_b32 s9, v43, 11 -; SI-NEXT: v_readlane_b32 s14, v43, 12 -; SI-NEXT: v_readlane_b32 s81, v43, 9 -; SI-NEXT: v_readlane_b32 s10, v43, 16 -; SI-NEXT: v_readlane_b32 s12, v43, 4 -; SI-NEXT: v_readlane_b32 s96, v43, 7 -; SI-NEXT: v_readlane_b32 s82, v43, 8 -; SI-NEXT: v_readlane_b32 s71, v43, 15 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: s_mov_b32 s89, s76 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: s_mov_b32 s88, s57 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: s_mov_b32 s94, s26 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: s_mov_b32 s93, s74 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: s_mov_b32 s90, s63 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: s_mov_b32 s91, s77 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: v_readlane_b32 s61, v44, 31 +; SI-NEXT: v_readlane_b32 s74, v44, 23 +; SI-NEXT: v_readlane_b32 s60, v44, 30 +; SI-NEXT: v_readlane_b32 s63, v44, 22 +; SI-NEXT: v_readlane_b32 s8, v44, 15 +; SI-NEXT: v_readlane_b32 s46, v44, 27 +; SI-NEXT: v_readlane_b32 s59, v44, 28 +; SI-NEXT: v_readlane_b32 s68, v44, 29 +; SI-NEXT: s_mov_b32 s92, s56 +; SI-NEXT: v_readlane_b32 s69, v44, 26 +; SI-NEXT: s_mov_b32 s76, s58 +; SI-NEXT: v_readlane_b32 s44, v44, 11 +; SI-NEXT: v_readlane_b32 s42, v44, 34 +; SI-NEXT: v_readlane_b32 s58, v44, 24 +; SI-NEXT: v_readlane_b32 s40, v44, 25 +; SI-NEXT: v_readlane_b32 s41, v44, 19 +; SI-NEXT: v_readlane_b32 s56, v44, 32 +; 
SI-NEXT: v_readlane_b32 s47, v44, 33 +; SI-NEXT: v_readlane_b32 s14, v44, 18 +; SI-NEXT: v_readlane_b32 s11, v44, 10 +; SI-NEXT: v_readlane_b32 s9, v44, 16 +; SI-NEXT: v_readlane_b32 s10, v44, 17 +; SI-NEXT: v_readlane_b32 s15, v44, 14 +; SI-NEXT: v_readlane_b32 s7, v44, 9 +; SI-NEXT: v_readlane_b32 s62, v44, 8 +; SI-NEXT: v_readlane_b32 s97, v44, 12 +; SI-NEXT: v_readlane_b32 s83, v44, 13 +; SI-NEXT: v_readlane_b32 s81, v44, 20 +; SI-NEXT: v_readlane_b32 s71, v44, 21 ; SI-NEXT: ; kill: killed $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr27 ; SI-NEXT: ; implicit-def: $sgpr20 ; SI-NEXT: ; implicit-def: $sgpr24 ; SI-NEXT: ; implicit-def: $sgpr29 -; SI-NEXT: ; implicit-def: $sgpr35 -; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr73 ; SI-NEXT: ; implicit-def: $sgpr21 -; SI-NEXT: ; implicit-def: $sgpr25 -; SI-NEXT: ; implicit-def: $sgpr22 ; SI-NEXT: ; implicit-def: $sgpr75 -; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $sgpr61 -; SI-NEXT: ; implicit-def: $sgpr16 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr78 ; SI-NEXT: ; implicit-def: $sgpr77 ; SI-NEXT: ; implicit-def: $sgpr79 -; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr16 ; SI-NEXT: ; implicit-def: $sgpr19 -; SI-NEXT: ; implicit-def: $sgpr89 -; SI-NEXT: ; implicit-def: $sgpr88 -; SI-NEXT: ; implicit-def: $sgpr13 ; SI-NEXT: ; implicit-def: $sgpr18 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr59 -; SI-NEXT: ; implicit-def: $sgpr73 -; SI-NEXT: ; implicit-def: $sgpr17 ; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr95 +; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr31 
+; SI-NEXT: ; implicit-def: $sgpr34 ; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr37 ; SI-NEXT: ; implicit-def: $sgpr99 -; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $sgpr57 -; SI-NEXT: ; kill: killed $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr98 ; SI-NEXT: ; implicit-def: $sgpr49 ; SI-NEXT: ; implicit-def: $sgpr50 ; SI-NEXT: ; implicit-def: $sgpr51 -; SI-NEXT: ; implicit-def: $sgpr97 +; SI-NEXT: ; implicit-def: $sgpr12 ; SI-NEXT: ; implicit-def: $sgpr52 ; SI-NEXT: ; implicit-def: $sgpr53 ; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr96 ; SI-NEXT: ; implicit-def: $sgpr55 ; SI-NEXT: ; implicit-def: $sgpr64 ; SI-NEXT: ; implicit-def: $sgpr65 ; SI-NEXT: ; implicit-def: $sgpr87 ; SI-NEXT: ; implicit-def: $sgpr66 ; SI-NEXT: ; implicit-def: $sgpr67 -; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr45 ; SI-NEXT: ; implicit-def: $sgpr86 -; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $sgpr57 ; SI-NEXT: ; implicit-def: $sgpr70 -; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr13 ; SI-NEXT: ; implicit-def: $sgpr85 ; SI-NEXT: ; implicit-def: $sgpr80 -; SI-NEXT: ; implicit-def: $sgpr11 -; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr82 ; SI-NEXT: ; implicit-def: $sgpr84 -; SI-NEXT: ; implicit-def: $sgpr83 +; SI-NEXT: ; implicit-def: $sgpr28 ; SI-NEXT: s_branch .LBB89_2 ; ; VI-LABEL: bitcast_v128i8_to_v64bf16_scalar: @@ -151236,13 +151286,14 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 ; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 ; VI-NEXT: 
v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v7 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v40, 8, v27 ; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 @@ -151254,46 +151305,42 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v8 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 
s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v10 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v12 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v26 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v24 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v28 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v30 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 ; 
VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:200 @@ -151302,34 +151349,37 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 ; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:232 ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 -; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v22 -; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v24 +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v26 +; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v28 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 ; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v0 +; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 
4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v2 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264 @@ -151348,6 +151398,11 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v6 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:328 @@ -151356,12 +151411,8 @@ 
define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:28 ; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:36 -; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v6 ; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v3 ; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v5 -; VI-NEXT: s_waitcnt vmcnt(10) -; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 ; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v0 ; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:44 @@ -151370,47 +151421,45 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:68 ; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:76 ; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:108 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:116 ; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:124 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:140 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:156 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:164 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:148 
+; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:172 ; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:180 ; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:196 ; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:204 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:212 ; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:220 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:228 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:244 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:252 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:260 ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:268 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276 +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:284 -; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:292 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 
s32 offset:300 -; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:308 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:268 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:316 -; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:324 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:300 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:308 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:316 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:324 ; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill @@ -151420,46 +151469,50 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; VI-NEXT: 
buffer_store_dword v18, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, 
off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 
offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v7, off, s[0:3], 
s32 offset:536 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB89_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload @@ -151476,11 +151529,10 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -151504,6 +151556,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: 
s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v17, v10 ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload @@ -151520,38 +151573,43 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: v_or_b32_sdwa v1, v1, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v0, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v40, v42 +; VI-NEXT: v_mov_b32_e32 v42, v44 +; VI-NEXT: v_mov_b32_e32 v44, v45 +; VI-NEXT: v_mov_b32_e32 v45, v62 +; VI-NEXT: v_or_b32_sdwa v2, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v53, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 
s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v34, v24 ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v36, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v37, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -151559,77 +151617,74 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: v_or_b32_sdwa v0, v38, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v39, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v48, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v45, v62 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v48, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; 
VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v32, v1 ; VI-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v54, v22 -; VI-NEXT: v_mov_b32_e32 v41, v24 ; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v34, v0 +; VI-NEXT: v_mov_b32_e32 v33, v0 ; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v37, v1 -; 
VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v55, v26 +; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v50, v26 ; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v39, v0 -; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v49, v1 -; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v43, v27 +; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v51, v0 -; VI-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v35, v1 -; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v53, v28 +; VI-NEXT: v_mov_b32_e32 v53, v1 +; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v52, v28 ; VI-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v47, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v33, v0 -; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v57, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v47, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v36, v0 +; VI-NEXT: v_mov_b32_e32 v55, v0 +; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v35, v0 ; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v41, v1 +; 
VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v63, v27 +; VI-NEXT: v_mov_b32_e32 v46, v57 ; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v56, v0 +; VI-NEXT: v_mov_b32_e32 v36, v0 ; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v58, v1 -; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v61, v60 -; VI-NEXT: v_mov_b32_e32 v60, v59 +; VI-NEXT: v_mov_b32_e32 v56, v1 +; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v61, v59 ; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload @@ -151641,55 +151696,53 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v45, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:552 ; 
4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v44, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v50, v0 +; VI-NEXT: v_mov_b32_e32 v58, v0 ; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v1, v62, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v52, v0 -; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v43, v0 +; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v0, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v59, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v46, v1 -; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v60, v1 +; VI-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded 
Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v63, v0 -; VI-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v54, v0 +; VI-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v47, v1 -; VI-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v57, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v0, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 
offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 @@ -151721,12 +151774,10 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_cbranch_execnz .LBB89_3 ; VI-NEXT: .LBB89_2: ; %cmp.true -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v59 -; VI-NEXT: v_or_b32_sdwa v29, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded 
Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 @@ -151745,165 +151796,147 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: s_lshl_b32 s9, s19, 8 ; VI-NEXT: s_add_i32 s16, s16, 3 ; VI-NEXT: s_lshl_b32 s10, s17, 8 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v26, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v28, v60, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v59 +; VI-NEXT: v_or_b32_sdwa v25, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v62 -; VI-NEXT: v_or_b32_sdwa v28, v43, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v44 -; VI-NEXT: v_or_b32_sdwa v53, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v27, v63, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v45 -; VI-NEXT: v_or_b32_sdwa v27, v55, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v52, v43, v3 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v44 +; VI-NEXT: v_or_b32_sdwa v26, v50, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v42 -; VI-NEXT: v_or_b32_sdwa v52, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v63, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v40 -; VI-NEXT: v_or_b32_sdwa v25, v48, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v60 -; VI-NEXT: v_or_b32_sdwa v59, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v43, v48, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v61 -; VI-NEXT: v_or_b32_sdwa v24, v58, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v59, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v46 +; VI-NEXT: v_or_b32_sdwa v24, v56, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v48, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v48, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v48, vcc, 0x300, v48 ; VI-NEXT: v_or_b32_sdwa v24, v24, v48 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v24 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: v_or_b32_sdwa v23, v41, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v38, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v38, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v38, vcc, 0x300, v38 ; VI-NEXT: v_or_b32_sdwa v23, v23, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v23 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v22, v54, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v22, v34, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v50, v33, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v50, vcc, 0x300, v50 +; VI-NEXT: v_or_b32_sdwa v36, v55, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: 
buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v50, vcc, 0x300, v36 ; VI-NEXT: v_or_b32_sdwa v22, v22, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v22 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v21, v35, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v21, v53, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v54, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v53, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: v_or_b32_sdwa v20, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: v_or_b32_sdwa v49, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 
4-byte Folded Spill -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v49, vcc, 0x300, v49 ; VI-NEXT: v_or_b32_sdwa v20, v20, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v20 -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: v_or_b32_sdwa v19, v37, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v37, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v37, v33, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v37, vcc, 0x300, v37 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v31, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 
offset:796 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v19, v19, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v19 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: v_or_b32_sdwa v18, v32, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v57, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v35, vcc, 0x300, v57 -; VI-NEXT: v_or_b32_sdwa v18, v18, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v58, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v58 +; VI-NEXT: v_or_b32_sdwa v18, v18, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v18 ; VI-NEXT: s_waitcnt vmcnt(1) ; 
VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v16, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v10, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v11, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v11, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v15, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v15, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v56, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v56, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v14, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v14, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v34, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload 
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v34 ; VI-NEXT: v_or_b32_sdwa v14, v14, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v14 @@ -151912,67 +151945,78 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v13, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v31, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v36, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v35, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v36 -; VI-NEXT: v_or_b32_sdwa v13, v13, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v26 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x300, v52 -; VI-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 
-; VI-NEXT: v_add_u32_e32 v52, vcc, 0x300, v54 -; VI-NEXT: v_or_b32_sdwa v21, v21, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v35, vcc, 0x300, v35 +; VI-NEXT: v_or_b32_sdwa v13, v13, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v35, vcc, 0x300, v25 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x300, v59 +; VI-NEXT: v_or_b32_sdwa v25, v43, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v28, v28, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v13 -; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v21 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v26 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v25 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v28 +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v12, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v30, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v51, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v51 ; VI-NEXT: v_or_b32_sdwa v12, v12, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v59 -; VI-NEXT: v_or_b32_sdwa v25, v25, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x300, v1 +; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v63 +; VI-NEXT: v_or_b32_sdwa v26, v26, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v30, v30, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12 -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v25 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v26 +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v30 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v33, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v57, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], 
s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v40, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_or_b32_sdwa v30, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v39, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v2 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -151996,15 +152040,14 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: v_add_u32_e32 v41, vcc, 0x300, v10 ; VI-NEXT: 
v_add_u32_e32 v10, vcc, 0x300, v55 ; VI-NEXT: v_or_b32_sdwa v10, v39, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v53 -; VI-NEXT: v_or_b32_sdwa v27, v28, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v28, v29, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v29, v30, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v52 +; VI-NEXT: v_add_u32_e32 v52, vcc, 0x300, v53 +; VI-NEXT: v_or_b32_sdwa v21, v21, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v27, v27, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9 ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v21 ; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v27 -; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v28 -; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v29 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -152020,18 +152063,14 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v42 ; VI-NEXT: v_or_b32_sdwa v8, v8, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v11 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x300, v40 -; VI-NEXT: v_or_b32_sdwa v11, v33, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x300, v1 -; VI-NEXT: v_or_b32_sdwa v30, v31, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v17, v17, v42 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x300, v40 +; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 ; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 -; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v30 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v7, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload @@ -152071,19 +152110,29 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: v_or_b32_sdwa v46, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_or_b32_sdwa v29, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v46, vcc, 0x300, v46 ; VI-NEXT: v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 
offset:400 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v2 +; VI-NEXT: v_or_b32_sdwa v29, v29, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v29 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 ; VI-NEXT: v_or_b32_sdwa v4, v47, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v47, vcc, 3, v32 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x300, v4 ; VI-NEXT: v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 @@ -152150,35 +152199,38 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB89_4: -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, 
off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v61, v60 -; VI-NEXT: v_mov_b32_e32 v60, v59 +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v61, v59 +; VI-NEXT: v_mov_b32_e32 v46, v57 +; 
VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v40, v42 +; VI-NEXT: v_mov_b32_e32 v42, v44 +; VI-NEXT: v_mov_b32_e32 v44, v45 ; VI-NEXT: v_mov_b32_e32 v45, v62 -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v57, v5 +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v47, v4 -; VI-NEXT: v_mov_b32_e32 v63, v3 -; VI-NEXT: v_mov_b32_e32 v53, v28 -; VI-NEXT: v_mov_b32_e32 v43, v27 -; VI-NEXT: v_mov_b32_e32 v55, v26 -; VI-NEXT: v_mov_b32_e32 v41, v24 -; VI-NEXT: v_mov_b32_e32 v54, v22 +; VI-NEXT: v_mov_b32_e32 v54, v3 +; VI-NEXT: v_mov_b32_e32 v52, v28 +; VI-NEXT: v_mov_b32_e32 v63, v27 +; VI-NEXT: v_mov_b32_e32 v50, v26 +; VI-NEXT: v_mov_b32_e32 v34, v24 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_branch .LBB89_2 @@ -152240,18 +152292,18 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v29 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v5 ; GFX9-NEXT: v_lshlrev_b32_e32 v22, 8, v7 ; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v9 ; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v11 ; GFX9-NEXT: v_lshlrev_b32_e32 v20, 8, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v25 -; GFX9-NEXT: 
v_lshlrev_b32_e32 v12, 8, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v19 ; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v21 ; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v27 ; GFX9-NEXT: s_waitcnt vmcnt(24) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v43 ; GFX9-NEXT: s_waitcnt vmcnt(23) @@ -152280,10 +152332,10 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v52 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v51 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v50 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill @@ -152295,7 +152347,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v39 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v30 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill @@ -152343,7 +152395,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> 
inreg %a ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill @@ -152370,23 +152422,23 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v15 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword 
v11, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 @@ -152399,48 +152451,49 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v3 ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:44 ; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:52 ; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68 ; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:76 ; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:92 ; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:100 ; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:116 ; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:124 ; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:132 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:140 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:140 ; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:148 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v62, 
off, s[0:3], s32 offset:156 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:164 -; GFX9-NEXT: s_waitcnt vmcnt(21) -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:172 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:180 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:196 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:204 ; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:212 -; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:220 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:228 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:220 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:228 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:236 ; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:244 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 -; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:252 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:260 ; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:268 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:276 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:276 ; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 
offset:284 ; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:292 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:300 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:316 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:300 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:308 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:316 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:324 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill @@ -152451,55 +152504,54 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(30) -; GFX9-NEXT: buffer_store_dword v30, off, 
s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(33) ; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(36) -; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v20, off, 
s[0:3], s32 offset:712 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(39) +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(41) -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(41) -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(40) +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(40) +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(41) +; GFX9-NEXT: s_waitcnt vmcnt(40) ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(41) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(41) +; GFX9-NEXT: s_waitcnt vmcnt(40) +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; 
GFX9-NEXT: s_waitcnt vmcnt(40) ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill @@ -152509,7 +152561,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB89_2 @@ -152522,7 +152574,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, 
v1 ; GFX9-NEXT: v_or_b32_sdwa v2, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v6, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -152559,10 +152611,10 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload @@ -152578,13 +152630,13 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v25 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -152592,7 +152644,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload @@ -152633,8 +152685,8 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: v_mov_b32_e32 v52, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded 
Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_mov_b32_e32 v50, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -152652,16 +152704,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_mov_b32_e32 v48, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v17, v17, 16, v1 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_mov_b32_e32 v33, v45 +; GFX9-NEXT: v_mov_b32_e32 v33, v46 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v18, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload @@ -152674,7 +152726,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; 
GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -152683,7 +152735,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -152691,121 +152743,122 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v22, v1, 16, v0 ; 
GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v34, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v23, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_mov_b32_e32 v46, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: 
v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v1, v35, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v35, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v24, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v35, v45 -; GFX9-NEXT: v_mov_b32_e32 v45, v61 -; GFX9-NEXT: v_mov_b32_e32 v61, v42 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v1, v51, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_mov_b32_e32 v38, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v37, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshl_or_b32 v25, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v54, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v54, v2 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v41, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v26, v1, 16, v0 -; GFX9-NEXT: 
v_or_b32_sdwa v1, v27, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v41, v57 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v29, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v54, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v26, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v45, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshl_or_b32 v27, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v60, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v60, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v1, v57, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v29, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v28, v1, 16, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v59, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:556 ; 4-byte Folded 
Reload ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v63, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v57, v59 ; GFX9-NEXT: v_lshl_or_b32 v29, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v30, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v56, v42 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 
v31, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_branch .LBB89_3 ; GFX9-NEXT: .LBB89_2: ; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v33, v45 -; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v33, v46 +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 
offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v56, v61 +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 s[4:5], -1 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: .LBB89_3: ; %Flow ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; 
GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] @@ -153008,7 +153061,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -153068,11 +153121,11 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v48, v46, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v48, v40, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v49, v35, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v49, v46, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 @@ -153107,7 +153160,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: v_or_b32_sdwa v53, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v24, 
3, v24 -; GFX9-NEXT: v_add_u32_e32 v26, 3, v61 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v62 ; GFX9-NEXT: v_or_b32_sdwa v24, v54, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v36, 0x300, v24 ; GFX9-NEXT: v_add_u32_e32 v24, 0x300, v48 @@ -153116,7 +153169,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v54, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v26, 3, v45 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v61 ; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 ; GFX9-NEXT: v_or_b32_sdwa v20, v57, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v34, 0x300, v20 @@ -153125,7 +153178,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v26, 3, v56 ; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 -; GFX9-NEXT: v_or_b32_sdwa v21, v32, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v21, v45, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v28, 0x300, v21 ; GFX9-NEXT: v_add_u32_e32 v21, 0x300, v54 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 @@ -154938,29 +154991,30 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 -; SI-NEXT: 
buffer_load_dword v37, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v45, off, 
s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -155049,31 +155103,31 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v36 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v39 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v50 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v35 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v55 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v40 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; 
SI-NEXT: v_mul_f32_e32 v62, 1.0, v37 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v41 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v41 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v43 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v44 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v56 -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v47 +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v60 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v63 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 @@ -155084,21 +155138,21 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v58 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; kill: killed $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v48 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v49 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v51 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v52 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v53 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v55 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v40 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v44 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v63 -; SI-NEXT: ; kill: killed $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v42 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v61 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v34 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v35 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v36 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v38 
+; SI-NEXT: v_mul_f32_e32 v57, 1.0, v48 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v52 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v54 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v45 +; SI-NEXT: ; kill: killed $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr35 @@ -155122,30 +155176,30 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v1 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v2 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v3 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mul_f32_e32 v63, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v4 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v7 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; kill: killed $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v5 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v8 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_mul_f32_e32 v21, 1.0, v10 ; SI-NEXT: ; kill: killed $vgpr7 @@ -155157,7 +155211,7 
@@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: ; kill: killed $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v1 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 @@ -155330,9 +155384,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v3 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -155345,10 +155399,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(4) @@ -155378,7 +155432,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x 
bfloat> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_alignbit_b32 v26, v1, v2, 16 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v63 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_alignbit_b32 v40, v1, v2, 16 @@ -155390,7 +155444,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_alignbit_b32 v23, v1, v2, 16 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v61 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_alignbit_b32 v54, v1, v2, 16 @@ -155402,7 +155456,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_alignbit_b32 v18, v1, v2, 16 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v27 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_alignbit_b32 v52, v1, v2, 16 @@ -155411,88 +155465,148 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v15, v1, v57, 16 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v15, v1, v16, 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v57 +; 
SI-NEXT: v_lshrrev_b32_e32 v1, 16, v16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: v_alignbit_b32 v50, v1, v2, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v42 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v58 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v13, v1, v61, 16 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: v_alignbit_b32 v13, v1, v59, 16 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v58 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_alignbit_b32 v48, v1, v2, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v49 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v11, v1, v60, 16 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v59 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v11, v1, v51, 16 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v49 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_alignbit_b32 v37, v1, v2, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v49 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:588 ; 4-byte 
Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v8, v1, v51, 16 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v53 +; SI-NEXT: v_alignbit_b32 v8, v1, v46, 16 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v49 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v51 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_alignbit_b32 v34, v1, v2, 16 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v46 -; SI-NEXT: v_alignbit_b32 v5, v1, v53, 16 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v55 +; SI-NEXT: v_alignbit_b32 v5, v1, v6, 16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v53 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_alignbit_b32 v31, v1, v2, 16 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 ; SI-NEXT: v_alignbit_b32 v4, v1, v9, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v62 
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v12 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v63 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; kill: killed $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_alignbit_b32 v28, v1, v2, 16 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v56 -; SI-NEXT: v_alignbit_b32 v3, v1, v41, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v45 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v41 +; SI-NEXT: v_alignbit_b32 v3, v1, v43, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v57 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_alignbit_b32 v25, v1, v2, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v63 -; SI-NEXT: v_alignbit_b32 v2, v1, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19 +; SI-NEXT: v_alignbit_b32 v2, v1, v36, 16 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 -; SI-NEXT: v_alignbit_b32 v22, v1, v7, 16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_alignbit_b32 v20, v7, v9, 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v6 +; SI-NEXT: v_alignbit_b32 v22, v1, v6, 16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v56 +; SI-NEXT: v_alignbit_b32 v20, v6, v7, 16 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v6, off, 
s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v62 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21 +; SI-NEXT: v_alignbit_b32 v1, v1, v60, 16 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; kill: killed $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_alignbit_b32 v17, v6, v7, 16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v47 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v14, v6, v39, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v42 +; SI-NEXT: v_alignbit_b32 v10, v6, v44, 16 +; SI-NEXT: v_mov_b32_e32 v63, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v61 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v27 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v58 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v49 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v51 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_lshrrev_b32_e32 v6, 24, v53 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v12 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v57 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v56 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v44 +; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v42 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v30 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v40, v38, 24 @@ -155595,9 +155709,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v20, v5, 24 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v19 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v20, v5, 16 @@ -155605,49 +155716,33 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v20, v5, 8 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:384 ; 4-byte 
Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_alignbit_b32 v17, v7, v9, 16 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v6, v17, v4, 24 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v12 +; SI-NEXT: v_alignbit_b32 v6, v17, v4, 24 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v17, v4, 16 -; SI-NEXT: v_alignbit_b32 v14, v7, v27, 16 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v17, v4, 8 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v14, v3, 24 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v36 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v14, v3, 16 -; SI-NEXT: v_alignbit_b32 v10, v7, v39, 16 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v14, v3, 8 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v10, v2, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v30 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; 
SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v10, v2, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v43, 16 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v7, v7, v55, 16 +; SI-NEXT: v_alignbit_b32 v7, v7, v45, 16 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v10, v2, 8 @@ -155752,173 +155847,135 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; kill: killed $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v58 ; SI-NEXT: ; kill: killed $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v33 ; SI-NEXT: ; kill: killed $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v47 ; SI-NEXT: ; kill: killed $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v57 ; SI-NEXT: ; kill: killed $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v42 ; SI-NEXT: ; kill: killed $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v59 ; SI-NEXT: ; kill: killed $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr6 
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v49 ; SI-NEXT: ; kill: killed $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v51 ; SI-NEXT: ; kill: killed $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v62 ; SI-NEXT: ; kill: killed $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v45 ; SI-NEXT: ; kill: killed $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v24 ; SI-NEXT: ; kill: killed $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 ; SI-NEXT: ; kill: killed $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: v_lshrrev_b32_e32 v27, 24, v62 +; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v47 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v19 -; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v36 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr24 ; 
SI-NEXT: ; kill: killed $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: .LBB90_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB90_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v53 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_alignbit_b32 v8, v11, v8, 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v47 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v49 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_alignbit_b32 v11, v14, v11, 16 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v19 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v14, v14, v13, 16 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; 
SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v58 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v21 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v60 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v63 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v41 ; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v30 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v33 ; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v9 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v45 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v44 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v46 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v30 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; SI-NEXT: 
v_and_b32_e32 v6, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_alignbit_b32 v5, v8, v5, 16 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v39 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v36 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v49 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v51 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_alignbit_b32 v8, v11, v8, 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v16 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v27 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v12 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v59 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v60 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_alignbit_b32 v11, v14, v11, 16 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v6, v7, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v14, v14, v13, 16 -; SI-NEXT: 
buffer_load_dword v13, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v42 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v44 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v6 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v9 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v10, v63, v10, 16 +; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v19 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload @@ -155927,39 +155984,41 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:280 
; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v61 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v62 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_alignbit_b32 v13, v17, v13, 16 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v21 ; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v17, v17, v15, 16 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 ; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: 
v_alignbit_b32 v23, v26, v23, 16 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_alignbit_b32 v29, v32, v29, 16 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 @@ -155968,7 +156027,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 ; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v19 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v49 ; SI-NEXT: s_waitcnt vmcnt(5) @@ -155987,6 +156045,14 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v49 ; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v51 ; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v53 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v56 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v55 ; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v41 ; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill @@ -155994,91 +156060,80 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: 
buffer_store_dword v40, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v15 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v57 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_alignbit_b32 v15, v20, v15, 16 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v20, v20, v18, 16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v6 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v9 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v18 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v30 +; SI-NEXT: v_alignbit_b32 v20, v16, v18, 16 ; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v24 -; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v18 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v18 ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_alignbit_b32 v18, v23, v18, 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; SI-NEXT: 
buffer_store_dword v23, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_alignbit_b32 v18, v22, v18, 16 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v24 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v21, 24, v21 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v23 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v45 -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v22, v22, v16, 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_alignbit_b32 v23, v26, v23, 16 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v24 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v16 ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v24 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: 
v_and_b32_e32 v26, 0xffff0000, v26 -; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v26 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v62 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v26 ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_alignbit_b32 v26, v29, v26, 16 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v26, v28, v26, 16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v27, 24, v27 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v29 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v29 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v28, v28, v27, 16 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v12, 24, v12 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v12, 24, v16 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v12, 24, v24 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 
offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v12, 24, v30 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_alignbit_b32 v29, v32, v29, 16 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 ; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v30, 24, v30 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v27, 24, v27 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v27, 24, v21 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; SI-NEXT: v_add_f32_e32 v34, 0x40c00000, v32 @@ -156098,8 +156153,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v33 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v37, 0x40c00000, v35 @@ -156361,8 +156414,8 @@ define <128 
x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; SI-NEXT: .LBB90_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v38 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 @@ -156816,7 +156869,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v4, 0xff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v47 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v27 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 @@ -156849,7 +156902,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v33 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v61 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 @@ -156880,17 +156933,17 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v58 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: 
buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v63 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 @@ -156920,7 +156973,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -156971,12 +157024,65 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; kill: killed $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; kill: killed $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; kill: killed $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; kill: killed $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; kill: killed $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; kill: killed $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; kill: killed $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; kill: killed $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; kill: killed $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; kill: killed $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr34 +; 
VI-NEXT: ; kill: killed $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; kill: killed $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; kill: killed $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; kill: killed $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; kill: killed $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; kill: killed $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; kill: killed $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; kill: killed $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; kill: killed $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; kill: killed $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; kill: killed $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; kill: killed $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; kill: killed $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; kill: killed $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; kill: killed $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; kill: killed $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr45 ; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr55 -; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr43 ; VI-NEXT: ; implicit-def: $vgpr60 ; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: ; implicit-def: $vgpr42 @@ -156989,11 +157095,13 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr46 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 ; VI-NEXT: ; implicit-def: $vgpr53 ; VI-NEXT: ; implicit-def: $vgpr62 ; VI-NEXT: ; 
implicit-def: $vgpr61 -; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr36 ; VI-NEXT: ; implicit-def: $vgpr58 ; VI-NEXT: ; implicit-def: $vgpr48 ; VI-NEXT: ; implicit-def: $vgpr52 @@ -157096,173 +157204,115 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; kill: killed $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed 
$vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB90_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v29 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v28 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v28 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v28 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v27 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v27 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v26 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v26 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v26 ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v25 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:296 ; 4-byte 
Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v25 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v24 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v24 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v24 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v23 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v23 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v22 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v22 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 -; VI-NEXT: buffer_store_dword 
v51, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v22 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v6 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v6 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v5 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v21 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v4 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v20 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v20 -; VI-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v4 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v20 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v3 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v19 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v2 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v18 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v18 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v2 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v18 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], 
s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v30 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v29 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v27 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; VI-NEXT: buffer_store_dword 
v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 ; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[15:16] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v1 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v17 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 
offset:404 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[13:14] @@ -157287,17 +157337,15 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[31:32] -; VI-NEXT: v_lshrrev_b32_e32 v46, 24, v12 -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v45, v46 ; VI-NEXT: v_lshrrev_b64 v[46:47], 24, v[29:30] -; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v11 +; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v11 ; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v31 +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v46, v63 -; VI-NEXT: v_mov_b32_e32 v63, v50 +; VI-NEXT: v_mov_b32_e32 v46, v51 +; VI-NEXT: v_mov_b32_e32 v45, v50 ; VI-NEXT: v_lshrrev_b64 v[50:51], 24, v[27:28] ; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v10 ; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v9 @@ -157319,7 +157367,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; 
VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[17:18] ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v12 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 ; VI-NEXT: v_lshrrev_b32_e32 v40, 24, v10 @@ -157328,30 +157377,29 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v8 ; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v7 ; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v54, 24, v6 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v32 ; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v32 ; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v32 ; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v31 -; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v30 -; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v30 -; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v30 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v30 +; VI-NEXT: v_lshrrev_b32_e32 v61, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v62, 8, v19 +; VI-NEXT: v_lshrrev_b32_e32 v53, 24, v18 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v18 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v17 ; VI-NEXT: v_mov_b32_e32 v47, v34 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v2 ; VI-NEXT: .LBB90_2: ; %Flow ; VI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: 
buffer_load_dword v44, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v34, v36 ; VI-NEXT: s_xor_b64 exec, exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB90_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v18 ; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 ; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 ; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 @@ -157970,109 +158018,112 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18] -; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v28 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v28 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v28 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v27 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v27 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v26 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v26 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v26 ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 
offset:292 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v25 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v25 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v24 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v24 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v24 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v23 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v23 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v22 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; 
VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v22 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v22 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v6 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v6 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v5 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v21 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v4 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v20 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill 
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v20 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v4 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v20 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v19 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v3 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v19 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v2 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v18 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 -; VI-NEXT: buffer_store_dword v33, 
off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v2 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v18 -; VI-NEXT: v_lshrrev_b64 v[56:57], 24, v[25:26] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v45, 24, v12 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v30 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v29 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v27 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; VI-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:392 ; 4-byte Folded 
Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; VI-NEXT: v_lshrrev_b64 v[56:57], 24, v[25:26] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 ; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v12 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 ; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v11 @@ -158086,69 +158137,63 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v8 ; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v7 ; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v54, 24, v6 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v6 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v32 ; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v32 ; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v32 ; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v31 -; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v31 -; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v30 -; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v30 -; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v30 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v29 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v17 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v31 +; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v30 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v61, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v62, 8, v19 +; VI-NEXT: v_lshrrev_b32_e32 v53, 24, v18 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v18 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, 
v17 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v17 ; VI-NEXT: .LBB90_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v44 +; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v63 ; VI-NEXT: v_or_b32_sdwa v1, v1, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v43 ; VI-NEXT: v_or_b32_sdwa v2, v2, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v43 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v43, v44, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v43, v36, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v36, v1 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -158161,22 +158206,28 @@ define <128 x i8> 
@bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v38 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v54 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -158224,10 +158275,14 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v49 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v45 ; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -158287,48 +158342,34 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; 
VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v38 ; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 64, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v55 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v53 ; VI-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; 
VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v62 ; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -158339,10 +158380,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 
offset:120 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -158352,9 +158393,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -158365,10 +158406,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 
offset:364 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -158378,9 +158419,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -158391,8 +158432,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v56 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -158402,9 +158443,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -158415,10 +158456,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) @@ -158429,9 +158470,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; 
VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -158442,26 +158483,34 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34 -; VI-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0 ; VI-NEXT: buffer_store_dword 
v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v53 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v61 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v34 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v62, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v63 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v45 ; VI-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 @@ -158514,714 +158563,716 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v46, v15 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; 
GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr58 -; GFX9-NEXT: 
; implicit-def: $vgpr58 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: ; kill: killed $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr59 -; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: ; kill: killed $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr59 -; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: ; kill: killed $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr59 -; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: ; kill: killed $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: v_mov_b32_e32 v47, v16 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; kill: killed $vgpr59 +; GFX9-NEXT: v_mov_b32_e32 v36, v15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; kill: 
killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: 
$vgpr15 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: v_mov_b32_e32 v37, v16 +; GFX9-NEXT: ; kill: killed $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr57 ; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: ; kill: killed $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr51 -; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr49 ; GFX9-NEXT: ; implicit-def: $vgpr52 -; GFX9-NEXT: ; implicit-def: $vgpr16 -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr45 
; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr54 -; GFX9-NEXT: ; implicit-def: $vgpr62 -; GFX9-NEXT: ; implicit-def: $vgpr61 -; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr49 ; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: ; implicit-def: $vgpr45 ; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr43 ; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr57 -; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr62 ; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr59 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:192 ; 
4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 
0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte 
Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:76 ; 4-byte 
Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(33) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; GFX9-NEXT: ; implicit-def: $vgpr15 -; GFX9-NEXT: ; kill: killed $vgpr15 -; GFX9-NEXT: ; implicit-def: $vgpr15 -; GFX9-NEXT: ; kill: killed $vgpr15 -; GFX9-NEXT: ; implicit-def: $vgpr15 -; GFX9-NEXT: ; kill: killed $vgpr15 -; GFX9-NEXT: ; implicit-def: $vgpr15 -; GFX9-NEXT: ; kill: killed $vgpr15 -; GFX9-NEXT: ; implicit-def: $vgpr15 -; GFX9-NEXT: ; kill: killed $vgpr15 -; GFX9-NEXT: ; implicit-def: $vgpr15 -; GFX9-NEXT: ; kill: killed $vgpr15 -; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB90_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v47 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(38) -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:280 ; 
4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v47 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v47 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v46 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v46 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v37 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v37 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v36 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v14 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 
16, v14 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v14 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v13 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v13 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v12 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v12 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; 
GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v12 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v11 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v10 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v10 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v9 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v9 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 24, v6 -; GFX9-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v18 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v8 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v6 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[46:47] -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v5 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v20 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[13:14] -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 
offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[11:12] -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[9:10] -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[7:8] -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[5:6] -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[3:4] -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[1:2] -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[31:32] -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[29:30] -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte 
Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[27:28] -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[25:26] -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[23:24] -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[21:22] -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[19:20] -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[17:18] -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v8 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v8 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 24, 
v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v17 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v6 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v6 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v4 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v4 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v2 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:416 ; 4-byte 
Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v32 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v32 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(59) +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v31 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v31 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v30 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v30 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v30 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v29 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v28 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v28 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v28 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v27 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v26 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v26 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v26 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], 
s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v25 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v25 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v24 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v24 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v23 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v22 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v22 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v21 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v21 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v20 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v18 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[36:37] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[13:14] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; 
GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[11:12] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[9:10] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[3:4] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[1:2] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[29:30] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[27:28] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: 
v_lshrrev_b64 v[33:34], 24, v[25:26] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[23:24] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18] +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 24, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 24, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 24, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 24, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, 
v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v17 +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[5:6] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-NEXT: .LBB90_2: ; %Flow -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v58, v57 -; GFX9-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB90_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v18 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v18 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 ; GFX9-NEXT: s_movk_i32 s6, 0x7fff -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v33, v16, v33, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v18, vcc +; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX9-NEXT: v_add3_u32 v34, v34, v33, s6 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v39, v34, v35, vcc +; GFX9-NEXT: v_add3_u32 v33, v33, v18, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; GFX9-NEXT: 
v_cmp_u_f32_e32 vcc, v18, v18 +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v62, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_add3_u32 v33, v33, v18, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; GFX9-NEXT: s_mov_b32 s7, 0x7060302 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v33, vcc +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: v_perm_b32 v15, v17, v15, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v20 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v63, v18, v33, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_perm_b32 v16, v62, v39, s7 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v19 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v18, v20, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v17 +; 
GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v19 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_perm_b32 v16, v15, v63, s7 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v18, v20, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: v_perm_b32 v34, v15, v33, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v17 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v16, v18, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: v_perm_b32 v33, v15, v18, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v20 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v20 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: v_cndmask_b32_e32 v18, v16, v17, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: 
v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GFX9-NEXT: v_perm_b32 v15, v17, v15, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v22 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v22 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: v_perm_b32 v18, v15, v18, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v19 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v19 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v20, v16, v17, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc -; GFX9-NEXT: v_perm_b32 v17, v15, v20, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v22 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: 
buffer_store_dword v16, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v21 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v21 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v43, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: v_perm_b32 v16, v16, v15, s7 +; GFX9-NEXT: v_perm_b32 v15, v17, v43, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v24 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v24 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v38, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v23 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: 
v_cndmask_b32_e32 v24, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v23 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v22, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v22 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v16, v17, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: v_perm_b32 v18, v15, v18, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v21 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v21 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v61, v16, v17, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; 
GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc -; GFX9-NEXT: v_perm_b32 v17, v15, v61, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v24 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_perm_b32 v15, v17, v22, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v26 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v26 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v23, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v25 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v26, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v25 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v50, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_perm_b32 v16, v24, v38, s7 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:428 ; 
4-byte Folded Spill -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v24 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v16, v17, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: v_perm_b32 v59, v15, v18, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v23 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v23 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: v_cndmask_b32_e32 v18, v16, v17, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc -; GFX9-NEXT: v_perm_b32 v58, v15, v18, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v26 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v26 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: v_cndmask_b32_e32 v18, v16, v17, vcc -; GFX9-NEXT: v_bfe_u32 
v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: v_perm_b32 v63, v15, v18, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v25 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v25 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: v_cndmask_b32_e32 v18, v16, v17, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc -; GFX9-NEXT: v_perm_b32 v62, v15, v18, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v28 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v28 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v60, v16, v17, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v27 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v28, v16, v17, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, 
v15, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v27 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v26, v16, v17, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc -; GFX9-NEXT: v_perm_b32 v56, v15, v26, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v30 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v30 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v27, v16, v17, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v29 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v30, v16, v17, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v29 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v25, v16, v17, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc -; GFX9-NEXT: v_perm_b32 v33, v15, v25, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v32 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: 
v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v32 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v29, v16, v17, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: s_waitcnt vmcnt(51) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v31 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v16, v17, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v31 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v24, v16, v17, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc -; GFX9-NEXT: v_perm_b32 v35, v15, v24, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_perm_b32 v15, v17, v50, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v28 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v28 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v25, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: 
v_lshlrev_b32_e32 v17, 16, v27 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v28, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v27 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v51, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_perm_b32 v16, v26, v23, s7 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_perm_b32 v15, v17, v51, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v30 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v30 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v27, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v29 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v30, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v29 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 
v48, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_perm_b32 v16, v28, v25, s7 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_perm_b32 v15, v17, v48, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v32 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v32 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v29, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: s_waitcnt vmcnt(47) +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v31 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v31 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v49, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_perm_b32 v16, v30, v27, s7 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 
offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_perm_b32 v15, v17, v49, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 ; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_bfe_u32 v15, v2, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v31, v16, v17, vcc -; GFX9-NEXT: v_add3_u32 v15, v15, v2, s6 -; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v2 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v31, v18, v19, vcc +; GFX9-NEXT: v_add3_u32 v17, v17, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v22, v15, v16, vcc -; GFX9-NEXT: v_bfe_u32 v15, v2, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 1 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_add3_u32 v15, v15, v2, s6 -; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v2 +; GFX9-NEXT: v_add3_u32 v17, v17, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v23, v15, v16, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v52, v17, v18, vcc ; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v1 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v15, vcc -; GFX9-NEXT: 
v_perm_b32 v37, v1, v23, s7 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v17, vcc +; GFX9-NEXT: v_perm_b32 v60, v1, v52, s7 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v4 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 ; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v1 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v21, v2, v15, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v20, v2, v17, vcc ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 ; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 @@ -159235,13 +159286,13 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v20, v2, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v53, v2, v4, vcc ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 ; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_perm_b32 v48, v1, v20, s7 +; GFX9-NEXT: v_perm_b32 v54, v1, v53, s7 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v6 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 @@ -159250,7 +159301,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: v_cndmask_b32_e32 v18, v2, v3, vcc ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 ; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 @@ -159271,7 +159321,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x 
bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_perm_b32 v50, v1, v17, s7 +; GFX9-NEXT: v_perm_b32 v33, v1, v17, s7 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v8 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 @@ -159300,7 +159350,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v7, vcc -; GFX9-NEXT: v_perm_b32 v52, v1, v4, s7 +; GFX9-NEXT: v_perm_b32 v56, v1, v4, s7 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v10 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 @@ -159319,480 +159369,515 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cndmask_b32_e32 v10, v2, v10, vcc ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 ; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v1 +; GFX9-NEXT: v_or_b32_e32 v42, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v15, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v42, vcc ; GFX9-NEXT: v_add3_u32 v9, v9, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v1 +; GFX9-NEXT: v_or_b32_e32 v42, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v15, vcc -; GFX9-NEXT: v_perm_b32 v39, v1, v3, s7 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v42, vcc +; GFX9-NEXT: v_perm_b32 v40, v1, v3, s7 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v12 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 ; GFX9-NEXT: v_add3_u32 v9, v9, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v1 +; GFX9-NEXT: v_or_b32_e32 v44, 0x400000, v1 ; 
GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; GFX9-NEXT: v_bfe_u32 v12, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v15, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v44, vcc ; GFX9-NEXT: v_add3_u32 v12, v12, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v1 +; GFX9-NEXT: v_or_b32_e32 v44, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v11 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v12, v15, vcc -; GFX9-NEXT: v_bfe_u32 v15, v1, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v12, v44, vcc +; GFX9-NEXT: v_bfe_u32 v44, v1, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX9-NEXT: v_add3_u32 v15, v15, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v1 +; GFX9-NEXT: v_add3_u32 v44, v44, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v46, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v15, v16, vcc -; GFX9-NEXT: v_bfe_u32 v15, v11, 16, 1 -; GFX9-NEXT: v_add3_u32 v15, v15, v11, s6 -; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v44, v46, vcc +; GFX9-NEXT: v_bfe_u32 v44, v11, 16, 1 +; GFX9-NEXT: v_add3_u32 v44, v44, v11, s6 +; GFX9-NEXT: v_or_b32_e32 v46, 0x400000, v11 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v15, v16, vcc -; GFX9-NEXT: v_perm_b32 v54, v11, v2, s7 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v44, v46, vcc +; GFX9-NEXT: v_perm_b32 v44, v11, v2, s7 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v14 ; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GFX9-NEXT: v_bfe_u32 v15, v11, 16, 1 +; GFX9-NEXT: v_bfe_u32 v46, v11, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX9-NEXT: v_add3_u32 v15, v15, v11, s6 -; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v11 +; GFX9-NEXT: v_add3_u32 v46, v46, v11, s6 +; GFX9-NEXT: v_or_b32_e32 v47, 
0x400000, v11 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v15, v16, vcc -; GFX9-NEXT: v_bfe_u32 v15, v14, 16, 1 -; GFX9-NEXT: v_add3_u32 v15, v15, v14, s6 -; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v46, v47, vcc +; GFX9-NEXT: v_bfe_u32 v46, v14, 16, 1 +; GFX9-NEXT: v_add3_u32 v46, v46, v14, s6 +; GFX9-NEXT: v_or_b32_e32 v47, 0x400000, v14 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v13 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v46, v47, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v46, 16, v13 +; GFX9-NEXT: v_add_f32_e32 v46, 0x40c00000, v46 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_bfe_u32 v58, v46, 16, 1 ; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v41, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_bfe_u32 v15, v13, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v16, v41, vcc -; GFX9-NEXT: v_add3_u32 v15, v15, v13, s6 -; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v13 +; GFX9-NEXT: v_add3_u32 v58, v58, v46, s6 +; GFX9-NEXT: v_or_b32_e32 v59, 0x400000, v46 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v46, v46 +; GFX9-NEXT: v_bfe_u32 v46, v13, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v58, v59, vcc +; GFX9-NEXT: v_add3_u32 v46, v46, v13, s6 +; GFX9-NEXT: v_or_b32_e32 v58, 0x400000, v13 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v15, v16, vcc -; GFX9-NEXT: v_perm_b32 v41, v13, v1, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v47 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v46, v58, vcc +; GFX9-NEXT: v_perm_b32 v16, v32, v29, s7 +; GFX9-NEXT: v_perm_b32 v46, v13, v1, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v37 +; GFX9-NEXT: buffer_store_dword v15, 
off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX9-NEXT: v_bfe_u32 v15, v13, 16, 1 -; GFX9-NEXT: v_add3_u32 v15, v15, v13, s6 -; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v13 +; GFX9-NEXT: v_bfe_u32 v58, v13, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v37 +; GFX9-NEXT: v_add3_u32 v58, v58, v13, s6 +; GFX9-NEXT: v_or_b32_e32 v59, 0x400000, v13 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v15, v16, vcc -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v47 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v44, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v44, v16, v44, vcc -; GFX9-NEXT: v_perm_b32 v16, v44, v13, s7 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v13 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v58, v59, vcc +; GFX9-NEXT: v_bfe_u32 v58, v16, 16, 1 +; GFX9-NEXT: v_add3_u32 v58, v58, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v59, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v58, v59, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v58, 16, v36 +; GFX9-NEXT: v_add_f32_e32 v58, 0x40c00000, v58 +; GFX9-NEXT: v_bfe_u32 v35, v58, 16, 1 +; GFX9-NEXT: v_add3_u32 v35, v35, v58, s6 +; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v58 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v58, v58 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v35, v15, vcc +; GFX9-NEXT: v_and_b32_e32 v35, 0xffff0000, v36 +; GFX9-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX9-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX9-NEXT: v_add3_u32 v36, v36, v35, s6 +; GFX9-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v35, v35 +; GFX9-NEXT: v_cndmask_b32_e32 v35, v36, v37, vcc +; GFX9-NEXT: 
v_perm_b32 v58, v35, v15, s7 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v32 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v30 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v28 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v26 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v24 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; GFX9-NEXT: v_perm_b32 v59, v16, v13, s7 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v14 -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v12 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v10 -; GFX9-NEXT: v_perm_b32 v53, v8, v5, s7 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX9-NEXT: v_perm_b32 v55, v19, v20, s7 +; GFX9-NEXT: buffer_store_dword v1, off, 
s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX9-NEXT: v_perm_b32 v57, v8, v5, s7 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v32 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v30 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v28 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-NEXT: v_perm_b32 v34, v30, v27, s7 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; GFX9-NEXT: v_perm_b32 v36, v32, v29, s7 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v25 -; GFX9-NEXT: v_perm_b32 v38, v22, v31, s7 -; GFX9-NEXT: v_perm_b32 v42, v14, v11, s7 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v21 +; GFX9-NEXT: v_perm_b32 v47, v14, v11, s7 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: v_perm_b32 v45, v12, v9, s7 +; GFX9-NEXT: v_perm_b32 v41, v10, v7, s7 +; GFX9-NEXT: v_perm_b32 v34, v6, v18, s7 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v39 +; GFX9-NEXT: v_mov_b32_e32 v38, v54 +; GFX9-NEXT: v_perm_b32 v61, v21, v31, s7 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v43 +; GFX9-NEXT: v_mov_b32_e32 v43, v34 +; GFX9-NEXT: v_mov_b32_e32 v39, v55 +; GFX9-NEXT: v_mov_b32_e32 v42, v33 +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[38:39] ; 
GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v11 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v31 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v19 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v22 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v23 -; GFX9-NEXT: v_perm_b32 v55, v12, v9, s7 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v48 +; GFX9-NEXT: v_mov_b32_e32 v48, v60 +; GFX9-NEXT: v_mov_b32_e32 v49, v61 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v50 ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v20 -; GFX9-NEXT: v_perm_b32 v49, v19, v21, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v46 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_bfe_u32 v45, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v45, v45, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v43, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v46 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v43, v45, v43, vcc -; GFX9-NEXT: v_bfe_u32 v45, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v45, v45, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v46, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v45, v46, vcc -; GFX9-NEXT: v_perm_b32 v15, v15, v43, s7 -; GFX9-NEXT: 
v_lshrrev_b32_e32 v44, 16, v44 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v43 -; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[15:16] -; GFX9-NEXT: v_perm_b32 v51, v6, v18, s7 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v18 -; GFX9-NEXT: v_perm_b32 v40, v10, v7, s7 -; GFX9-NEXT: v_perm_b32 v57, v28, v60, s7 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v53 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 8, v44 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v41 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v42 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v7 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v60 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v61 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v39 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v17 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v17 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v17 -; GFX9-NEXT: buffer_load_dword v17, off, 
s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v63 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v17 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v15 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; 
GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v62 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[58:59] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[46:47] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[41:42] -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[44:45] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[54:55] -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[40:41] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded 
Spill -; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[39:40] -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[56:57] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[52:53] -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[50:51] -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[48:49] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[48:49] -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[42:43] +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 24, v59 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[61:62] +; GFX9-NEXT: buffer_store_dword 
v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[37:38] -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[35:36] -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v62 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[33:34] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[33:34] -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[56:57] -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[33:34] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: 
buffer_store_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[62:63] -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[58:59] -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[33:34] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[60:61] -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[33:34] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v43, 
off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[43:44] -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[33:34] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[43:44] -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[33:34] +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v16 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v15 -; 
GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v42 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v42 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v41 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v41 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v55 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v55 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v54 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v40 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v39 +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[50:51] +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v59 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: 
v_lshrrev_b32_e32 v16, 16, v58 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 24, v47 ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v53 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v53 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 24, v51 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v51 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 24, v38 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v38 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v37 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v37 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v36 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v36 -; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v35 -; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v34 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v33 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v57 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v57 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v56 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v56 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v63 -; GFX9-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v63 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v62 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v50 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v62 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v52 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v50 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v49 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v59 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v47 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v46 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v46 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v45 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v44 +; GFX9-NEXT: 
buffer_store_dword v16, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 24, v41 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v40 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v40 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 24, v57 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v57 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v56 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 24, v43 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v43 +; GFX9-NEXT: v_mov_b32_e32 v43, v38 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 24, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v43 +; GFX9-NEXT: v_mov_b32_e32 v43, v48 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v43 +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v61 +; GFX9-NEXT: v_mov_b32_e32 v43, v61 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v48 +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v43 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; 
GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v58 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 24, v45 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v42 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 24, v62 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v56 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v49 +; GFX9-NEXT: v_mov_b32_e32 v54, v56 +; GFX9-NEXT: v_mov_b32_e32 v38, v58 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v43 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v44 +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v44 ; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v59 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v61 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v61 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v60 -; GFX9-NEXT: v_mov_b32_e32 v33, v60 -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v54 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v40 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v52 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v49 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v48 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v48 -; GFX9-NEXT: 
v_lshrrev_b32_e32 v36, 16, v58 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v58 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v33 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v61 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v62 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v62 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v61 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v60 +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v43 +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 24, v44 +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v44 +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v43 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v43 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v45 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v45 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v44 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v44 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword 
v43, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v43 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 24, v44 +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v43 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v44 +; GFX9-NEXT: v_mov_b32_e32 v44, v35 +; GFX9-NEXT: v_mov_b32_e32 v35, v50 +; GFX9-NEXT: v_mov_b32_e32 v39, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v33 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v63 +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v62 +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v62 +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v49, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 24, v63 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 24, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v49 +; GFX9-NEXT: v_mov_b32_e32 v49, v47 +; GFX9-NEXT: v_mov_b32_e32 v47, v45 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v60 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v61 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v56, 8, v61 +; GFX9-NEXT: v_mov_b32_e32 v61, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v60 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v59 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v59 ; GFX9-NEXT: .LBB90_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v39 -; GFX9-NEXT: v_or_b32_sdwa v8, v8, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v45 -; GFX9-NEXT: v_or_b32_sdwa v9, v9, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v40 -; GFX9-NEXT: v_or_b32_sdwa v10, v10, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v54 -; GFX9-NEXT: v_or_b32_sdwa v11, v11, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v16 -; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v51 -; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v48 -; GFX9-NEXT: v_or_b32_sdwa v5, v5, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v57 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v59 +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v46 +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v42 -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v48 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v52 +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v41 +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v40 ; GFX9-NEXT: v_or_b32_sdwa v6, v6, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v62 -; GFX9-NEXT: v_or_b32_sdwa v17, v17, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v60 -; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v41 -; GFX9-NEXT: v_or_b32_sdwa v18, v18, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 -; GFX9-NEXT: v_or_b32_sdwa v12, v12, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v16 -; GFX9-NEXT: v_or_b32_sdwa v16, v47, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: 
v_lshlrev_b16_e32 v15, 8, v15 -; GFX9-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v16 +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 -; GFX9-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v51 -; GFX9-NEXT: v_or_b32_sdwa v38, v38, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v16 +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v55 +; GFX9-NEXT: v_or_b32_sdwa v10, v10, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v53 +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v16 +; GFX9-NEXT: v_or_b32_sdwa v12, v12, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 
offset:384 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v16 +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v16 +; GFX9-NEXT: v_or_b32_sdwa v14, v14, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v35 +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v16, v36, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v60 +; GFX9-NEXT: v_or_b32_sdwa v17, v17, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v56 +; GFX9-NEXT: v_or_b32_sdwa v18, v18, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v33 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v35 +; GFX9-NEXT: v_or_b32_sdwa v35, v37, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v36, v33, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v53 -; GFX9-NEXT: 
v_lshlrev_b16_e32 v15, 8, v15 -; GFX9-NEXT: v_or_b32_sdwa v15, v46, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v54 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], 
s32 offset:240 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v15 +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -159802,16 +159887,14 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte 
Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -159821,16 +159904,16 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], 
s32 offset:344 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -159840,18 +159923,15 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v38 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -159859,16 +159939,16 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x 
bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -159878,32 +159958,32 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded 
Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v35 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v43 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -159912,49 +159992,57 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v44 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v50 ; GFX9-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v43 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v33 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v62 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v63 ; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: 
buffer_store_dword v1, v0, s[0:3], 0 offen offset:76 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v55 -; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX9-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v50 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v37 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v58 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; 
GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:84 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v58 -; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX9-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:88 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v63 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v48 ; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 @@ -159962,11 +160050,11 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:92 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -159975,10 +160063,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:96 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded 
Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -159988,23 +160076,21 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:104 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -160014,23 +160100,21 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:108 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: 
v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v44, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -160040,11 +160124,11 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:116 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte 
Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -160053,15 +160137,12 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v61 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -160131,117 +160212,117 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr108_lo16 ; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr68_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr107_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr164_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr106_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr104_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr94_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr90_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr180_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr95_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr165_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr88_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr75_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr47_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr93_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr90_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr43_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr179_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr176_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr79_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr72_lo16 -; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr178_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr59_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr73_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr59_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr63_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr44_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr182_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr56_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr41_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr42_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr89_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr74_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr57_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr61_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr42_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr183_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr57_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr167_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr104_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr176_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr88_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr40_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr58_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr181_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr164_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr167_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr166_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr77_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16 -; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr95_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr165_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr93_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr92_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr79_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr75_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr179_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr92_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr89_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr62_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr77_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr60_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr46_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr40_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 -; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr181_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr73_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr61_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr182_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr177_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr178_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr166_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 -; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr100_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 @@ -160263,106 +160344,106 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[3:4] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v16 
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 24, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v43, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v42, 24, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v56, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v59, 24, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v72, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v76, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v75, 24, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v88, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v91, 8, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 24, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v94, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v40, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v46, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v56, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v72, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v79, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v91, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v93, 8, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v95, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v94, 24, v4 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v104, 8, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v106, 8, v3 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v105, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v108, 8, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 24, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v31 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v27 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 24, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v177, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 24, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v40, 8, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 8, v21 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v46, 24, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v60, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 24, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v74, 8, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v79, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 
8, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 8, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v178, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v177, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v41, 8, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v47, 8, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v60, 8, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v61, 24, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v73, 8, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v77, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v76, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v89, 8, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v92, 8, v17 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v179.h, v16.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v164.h, v14.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.h, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.h, v10.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v8.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.h, v2.h ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v101.h, v32.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.h, v30.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.h, v28.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.h, v26.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.h, v24.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.h, v22.h ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.h, v20.h ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20] +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v18.h ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18] -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.h, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.h, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 
v164.h, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.h, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.h, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v75.h, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v78.h, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v58.h, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v88.h, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v42.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v74.h, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v182.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v59.h, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v176.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v43.h, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v160.h, v6.l ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v180.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v165.h, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v161.h, v6.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v47.h, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v179.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v178.h, v8.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v73.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v41.h, v10.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v89.h, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v61.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v57.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v104.h, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v78.h, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v77.h, v14.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v95.h, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v93.h, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v92.h, v16.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.h, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.h, v19.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.h, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v20.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.h, v21.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.h, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.h, v22.h -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v97.h, v23.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v87.h, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.h, v24.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v101.h, v25.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.h, v26.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.h, v26.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v112.h, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v100.h, v28.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v99.h, v28.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v113.h, v29.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v103.h, v30.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v102.h, v30.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v163.h, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v114.h, v32.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.h, v31.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v115.h, v32.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v114.h, v32.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v112.h, v30.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v115.h, v29.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v102.h, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v113.h, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v99.h, v26.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v103.h, v25.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.h, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v100.h, v23.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.h, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v97.h, v21.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.h, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v87.h, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.h, v17.l ; GFX11-TRUE16-NEXT: .LBB90_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB90_4 @@ -160377,14 +160458,14 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 ; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add3_u32 v37, v37, v18, 0x7fff -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v70, v37, v39 :: v_dual_add_f32 v33, 0x40c00000, v33 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v80, v37, v39 :: v_dual_add_f32 v33, 0x40c00000, v33 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v33, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 ; GFX11-TRUE16-NEXT: v_add3_u32 v36, v36, v33, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v17 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v70.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v80.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v55, v36, v38, vcc_lo ; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_lshlrev_b32 v17, 16, v17 @@ -160398,556 +160479,568 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 ; GFX11-TRUE16-NEXT: v_add3_u32 v18, v48, v34, 0x7fff ; GFX11-TRUE16-NEXT: v_add3_u32 v37, v50, v17, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v71, v37, v51 :: v_dual_lshlrev_b32 v20, 16, v20 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v20, 0x40c00000, v20 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v83, v37, v51 :: v_dual_lshlrev_b32 v20, 16, v20 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v11 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v71.h -; 
GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v20 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v83.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v18, v49, vcc_lo ; GFX11-TRUE16-NEXT: v_bfi_b32 v18, 0xffff, v33, v55 ; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v20, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v20 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v35 ; GFX11-TRUE16-NEXT: v_bfi_b32 v17, 0xffff, v34, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v76, 24, v18 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v20, 0x7fff +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v89, 8, v18 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v92, 8, v17 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v82, v33, v37 :: v_dual_add_f32 v35, 0x40c00000, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 ; GFX11-TRUE16-NEXT: v_add3_u32 v34, v36, v35, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v20, 0x7fff -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 24, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v74, 8, v18 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v19, 0x40c00000, v19 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v81, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v35 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 
vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v22 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v19, 16, 1 ; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v80, v34, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v19 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v70, v34, v38 :: v_dual_add_f32 v19, 0x40c00000, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v19, 0x7fff -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add3_u32 v20, v20, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v81.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v79, 8, v17 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v84, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v22, 16, 1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v82.h +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v19, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v19 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v19, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v87, v33, v37, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v84.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v87.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v20, v38, vcc_lo -; 
GFX11-TRUE16-NEXT: v_bfe_u32 v19, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v22, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add3_u32 v20, v33, v22, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v22 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v84, v20, v33 :: v_dual_add_f32 v35, 0x40c00000, v35 +; GFX11-TRUE16-NEXT: v_bfi_b32 v20, 0xffff, v34, v70 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v84.h +; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v35, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v35, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v83, v20, v33, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v20, 0xffff, v34, v80 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v82, v19, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v61, 24, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v73, 8, v20 +; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v35, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v71, v19, v39, vcc_lo ; GFX11-TRUE16-NEXT: v_bfi_b32 v19, 0xffff, v37, v36 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v24 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v83.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v46, 24, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v60, 8, v20 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: 
v_dual_add_f32 v21, 0x40c00000, v21 :: v_dual_lshlrev_b32 v24, 16, v24 +; GFX11-TRUE16-NEXT: v_bfi_b32 v22, 0xffff, v22, v71 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v77, 8, v19 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; GFX11-TRUE16-NEXT: v_bfi_b32 v22, 0xffff, v22, v82 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 8, v19 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 24, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v40, 8, v22 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v33, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v21, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v21 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v33, 0x7fff +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v47, 8, v22 ; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v21, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v86, v34, v37 :: v_dual_and_b32 v37, 0xffff0000, v23 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v97, v34, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v38 
:: v_dual_add_f32 v36, 0x40c00000, v36 ; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v24, 16, 1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v33, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v24, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v33, 0x7fff ; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v36, 16, 1 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v21, v35, v38, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v24, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v24 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v23 ; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v86.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v87, v34, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v97.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v96, v34, v38, vcc_lo ; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v26 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 0x40c00000, v23 :: v_dual_lshlrev_b32 v26, 16, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_lshlrev_b32 v26, 16, v26 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_bfi_b32 v21, 0xffff, v35, v21 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v37, 
16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v26, 0x40c00000, v26 :: v_dual_cndmask_b32 v81, v33, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v37 ; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v23, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v85, v33, v39, vcc_lo ; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v23 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 8, v21 +; GFX11-TRUE16-NEXT: v_add3_u32 v24, v24, v37, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v96.h ; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v23, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v97, v34, v36, vcc_lo -; GFX11-TRUE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_f32 v34, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v97.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v37, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v37 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v100, v34, v36, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_add3_u32 v24, v24, v37, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v34, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v23, v24, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v26 +; GFX11-TRUE16-NEXT: v_bfi_b32 v21, 0xffff, v35, v21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v100.h +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v23, v24, v39 :: v_dual_add_f32 v34, 
0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_bfi_b32 v24, 0xffff, v33, v81 +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v26, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_bfi_b32 v23, 0xffff, v36, v23 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v25 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v87.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 8, v23 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-TRUE16-NEXT: v_bfi_b32 v24, 0xffff, v33, v85 -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v26, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 24, v24 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 ; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v26, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v26, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v177, 8, v24 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v98, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v177, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v41, 8, v24 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v60, 8, v21 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v99, v33, v37, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v85, v35, v38 :: 
v_dual_and_b32 v36, 0xffff0000, v25 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_lshlrev_b32 v25, 16, v25 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; GFX11-TRUE16-NEXT: v_bfe_u32 v26, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add3_u32 v26, v26, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v96, v35, v38 :: v_dual_add_f32 v25, 0x40c00000, v25 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v28 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v98.h ; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v25, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v25 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v28, 0x40c00000, v28 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v25, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v35, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v35 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v101, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v99.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v103, v33, v37, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v28, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add3_u32 v25, v25, v35, 0x7fff ; GFX11-TRUE16-NEXT: 
v_cndmask_b32_e32 v36, v26, v38, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add3_u32 v26, v33, v28, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v28 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v27 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v101.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v100, v26, v33, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v103.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v102, v26, v33, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v35 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_bfi_b32 v26, 0xffff, v34, v85 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v102.h +; GFX11-TRUE16-NEXT: v_add3_u32 v25, v25, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v38 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; GFX11-TRUE16-NEXT: v_bfi_b32 v26, 0xffff, v34, v96 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v100.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v99, v25, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 8, v26 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v86, v25, v39, vcc_lo ; GFX11-TRUE16-NEXT: v_bfi_b32 v25, 0xffff, v37, v36 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v30 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v27, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v27 -; 
GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 ; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v33, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v27, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v27, 0x40c00000, v27 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v33, 0x7fff +; GFX11-TRUE16-NEXT: v_bfi_b32 v28, 0xffff, v28, v86 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v112, v34, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v27, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v27 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v28 +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v27, 0x7fff +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v178, 8, v25 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v113, v34, v37, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 ; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_bfi_b32 v28, 0xffff, v28, v99 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v26 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v35, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v36, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v29 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v30, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v36, 
0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v35, v38, vcc_lo ; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v30 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v112.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v28 ; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v30, 0x7fff -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v25 -; GFX11-TRUE16-NEXT: v_bfi_b32 v27, 0xffff, v35, v27 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v113.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v112, v34, v38, vcc_lo ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v103, v34, v38 :: v_dual_and_b32 v38, 0xffff0000, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v32 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_lshlrev_b32 v32, 16, v32 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v27 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v29, 16, 1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v102, v33, v39 :: v_dual_add_f32 v37, 0x40c00000, v37 +; GFX11-TRUE16-NEXT: v_bfe_u32 v30, v37, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v29 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v98, v33, v39, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 ; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v29, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v103.h -; GFX11-TRUE16-NEXT: v_bfe_u32 v30, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v30, v30, v37, 0x7fff ; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v39, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v112.h ; GFX11-TRUE16-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v113, v34, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v115, v34, v36, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-TRUE16-NEXT: v_add3_u32 v30, v30, v37, 0x7fff -; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v113.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v29, v30, v39, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfi_b32 v27, 0xffff, v35, v27 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v115.h +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v29, v30, v39 :: v_dual_add_f32 v34, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_bfi_b32 v30, 0xffff, v33, v98 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v27 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfi_b32 v29, 0xffff, v36, v29 ; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_bfi_b32 v30, 0xffff, v33, v102 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-TRUE16-NEXT: v_bfi_b32 v29, 0xffff, v36, v29 ; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v34, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v32 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v31, 0x40c00000, v31 ; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v32, 0x7fff -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v31 
; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v30 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v115, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_lshlrev_b32 v31, 16, v31 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v29 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v115.h -; GFX11-TRUE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v30 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v114, v35, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v30 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v114, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 ; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v31, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v31 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 ; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v29 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v101, v35, v38, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v31, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v116, v33, v37 :: v_dual_and_b32 v35, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v114.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v116, v33, v37, 
vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v2, 16, v2 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v116.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_bfe_u32 v31, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v31, v31, v35, 0x7fff ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v32, v38, vcc_lo ; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v2, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v31, v31, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add3_u32 v32, v33, v2, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v133, v32, v33, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v38 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v32, 0xffff, v34, v114 -; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v132, v31, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_bfi_b32 v32, 0xffff, v34, v101 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v117, v31, v39, vcc_lo ; 
GFX11-TRUE16-NEXT: v_bfi_b32 v31, 0xffff, v37, v36 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v4, 16, v4 ; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v33, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v1, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v1, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33 ; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v33, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v2.l, v133.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 24, v32 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v146, v34, v37 :: v_dual_and_b32 v37, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v1, 0x7fff ; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v2.l, v133.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v32 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v149, v34, v37, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v132 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v36, 16, 1 ; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v35, v38 :: v_dual_add_f32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v117 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v31 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v36, 
0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v35, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v4, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v4 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v4, 0x7fff -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v35.l, v149.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v105, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 8, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v147, v34, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 ; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v37, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v148, v34, v38, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_add_f32 v6, 0x40c00000, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v118, v33, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v37, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v37 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v37, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v144, v33, v39, vcc_lo ; GFX11-TRUE16-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v37 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v33.l, v148.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v35.l, v146.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v105, 24, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v164, v34, v36, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v38 ; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v35, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 8, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v36.l, v164.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v108, 8, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v163, v34, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v36.l, v163.h +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v4, v39 :: v_dual_add_f32 v34, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v36, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v33.l, v147.h +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v34, 16, 1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v33, v144 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v34, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 
0x40c00000, v36 :: v_dual_add_f32 v5, 0x40c00000, v5 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v7, 0x40c00000, v7 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 24, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v94, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v106, 8, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v33, v118 ; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v34, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v108, 8, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v32 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v94, 24, v4 ; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v6, 0x7fff ; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v31 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v165, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v104, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v106, 8, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v160, v33, v37, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v34.l, v165.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v161, v35, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v129, v35, v38, vcc_lo ; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 
0xffff0000, v8 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v180, v33, v37 :: v_dual_add_f32 v35, 0x40c00000, v35 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v34.l, v160.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v180, v33, v37 :: v_dual_add_f32 v35, 0x40c00000, v35 +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v8, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v37.l, v180.h ; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v35, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v8, 16, 1 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v6, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v35, 0x7fff ; GFX11-TRUE16-NEXT: v_add3_u32 v6, v33, v8, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v8 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v179, v6, v33, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v176, v6, v33 :: 
v_dual_and_b32 v39, 0xffff0000, v7 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v7, 16, 1 -; GFX11-TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v34, v161 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v8.l, v179.h -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v178, v5, v38 :: v_dual_add_f32 v33, 0x40c00000, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v39 +; GFX11-TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v34, v129 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v8.l, v176.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v134, v5, v38, vcc_lo ; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v37, v36 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v10 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v7, 0x7fff +; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 0x40c00000, v7 :: v_dual_lshlrev_b32 v36, 16, v10 ; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33 +; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v8, v134 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v7, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v7 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33 ; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v33, 0x7fff -; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v8, v178 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v47, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v72, 24, v8 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v7, 0x7fff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v79, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 24, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v43, v35, v37, vcc_lo ; 
GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v59, 24, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v72, 8, v8 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v93, 8, v6 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v34, v38, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v47.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v75, 24, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v88, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v91, 8, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v44, v7, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v43.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v95, 8, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v182, v7, v37, vcc_lo ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v39 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v9, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v9 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v7 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX11-TRUE16-NEXT: v_add3_u32 v36, v36, v9, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v10, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v10 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v10, 0x7fff -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v41, v35, v38 :: v_dual_lshlrev_b32 v10, 16, v12 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 0x40c00000, v7 :: v_dual_cndmask_b32 v144, v35, v38 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v35.l, v182.h +; GFX11-TRUE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v12, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v52, 0x400000, v12 +; GFX11-TRUE16-NEXT: v_add3_u32 v36, v36, v9, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v48, v48, v12, 0x7fff ; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v10 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v44.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v35, v144 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v7, 16, 1 ; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v37, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v50, 0x400000, v37 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v35, v41 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v7, 16, 1 -; 
GFX11-TRUE16-NEXT: v_add3_u32 v38, v38, v37, 0x7fff -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v51 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v42, 24, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v7, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v61, v38, v50 :: v_dual_add_f32 v12, 0x40c00000, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v56, 24, v10 +; GFX11-TRUE16-NEXT: v_add3_u32 v38, v38, v37, 0x7fff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 8, v10 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v42, v38, v50, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v51 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v14 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v61.h -; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v12, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v52, 0x400000, v12 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v73, v35, v49, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v42.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v59, v35, v49, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 ; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v37, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v48, v48, v12, 0x7fff ; GFX11-TRUE16-NEXT: v_dual_add_f32 v14, 0x40c00000, v14 :: v_dual_lshlrev_b32 v11, 16, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v56, 8, v10 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v57, v48, v52, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v146, v48, v52, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_bfe_u32 v49, v14, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v7, v57 +; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v7, v146 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v36, v39, vcc_lo ; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v11 ; GFX11-TRUE16-NEXT: v_add3_u32 v11, v35, v37, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v37 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v73.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v59.h ; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v7, 16, 1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 24, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v46, 8, v12 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v11, v35, vcc_lo ; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 ; GFX11-TRUE16-NEXT: v_add3_u32 v37, v39, v7, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v7 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v13 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v13 ; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v35, 16, 1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v36, v9 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v39 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v89, v37, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v37, v48, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v74, v37, v38, vcc_lo ; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add3_u32 v37, v48, v35, 0x7fff ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_add3_u32 v39, v49, v14, 0x7fff ; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v48, 0x400000, v14 -; GFX11-TRUE16-NEXT: v_bfe_u32 v49, v7, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v77, v37, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 8, v9 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v164, v37, v38 :: v_dual_and_b32 v37, 0xffff0000, v16 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v39 +; GFX11-TRUE16-NEXT: v_add3_u32 v39, v49, v14, 0x7fff ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v7 -; GFX11-TRUE16-NEXT: v_add3_u32 v14, v49, v7, 0x7fff -; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v16 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v78, v39, v48, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-TRUE16-NEXT: v_bfe_u32 v49, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v58, v39, v48, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v13, 16, 1 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10] ; GFX11-TRUE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v78.h -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v7, v14, v35 :: v_dual_add_f32 v14, 0x40c00000, v37 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v15 +; GFX11-TRUE16-NEXT: v_add3_u32 v14, v49, v7, 0x7fff +; GFX11-TRUE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v58.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v16 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v14, v35, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v13, 16, 1 +; 
GFX11-TRUE16-NEXT: v_dual_add_f32 v14, 0x40c00000, v37 :: v_dual_lshlrev_b32 v37, 16, v15 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX11-TRUE16-NEXT: v_add3_u32 v35, v39, v13, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v13 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 -; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v16, 16, 1 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v16 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v104, v35, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v13, v13, v16, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v37, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v16, 16, 1 ; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v14, 16, 1 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v88, v35, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v13, v13, v16, 0x7fff +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v37 -; GFX11-TRUE16-NEXT: v_add3_u32 v39, v39, v37, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v93, v13, v49, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 ; GFX11-TRUE16-NEXT: v_add3_u32 v35, v48, v14, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v39, v39, v37, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v14 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v75, v13, v49, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 ; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v15, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v15 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v95, v39, v51, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v88.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v78, v39, v51, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, 
v14 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v104.h ; GFX11-TRUE16-NEXT: v_add3_u32 v13, v50, v15, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v89.h -; GFX11-TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v38, v77 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v92, v35, v48, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v74.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v38, v164 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v179, v35, v48, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v95.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v93.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v78.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v75.h ; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v39, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 24, v14 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 8, v14 -; GFX11-TRUE16-NEXT: v_bfi_b32 v16, 0xffff, v35, v92 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 8, v14 +; GFX11-TRUE16-NEXT: v_bfi_b32 v16, 0xffff, v35, v179 ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18] ; GFX11-TRUE16-NEXT: v_bfi_b32 v15, 0xffff, v15, v13 @@ -160966,16 +161059,15 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v43, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v76, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 24, v16 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v40, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v91, 8, v7 ; GFX11-TRUE16-NEXT: .LBB90_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v149.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v108.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v133.h @@ -160983,22 +161075,22 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, 0 ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v2.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v68.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v132.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v117.h ; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v2.h, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v106.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v164.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v163.h ; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v1.h, v2.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v105.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v2.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v94.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v91.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v104.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v95.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v14 ; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v4.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v148.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v147.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h ; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v3.h, 8, v67.l @@ -161007,9 +161099,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v180.h ; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v6 ; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v3.l, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v90.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v94.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v144.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v118.h ; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v14 ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v66.l @@ -161017,22 +161109,22 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v4.l, v6.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v165.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v88.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v160.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v93.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v8, v14 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v10 ; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v5.l, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v47.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v43.h ; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v76.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v58.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v91.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v63.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v75.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v90.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v161.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v129.h ; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v6.h, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 
0xff, v179.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v72.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v176.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v79.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v6.l, v8.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v7.l @@ -161041,29 +161133,29 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v8.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v14 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v73.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v59.h ; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v7.l, v7.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v178.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v59.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v134.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v72.l ; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v9.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v56.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v62.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v16 ; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v8.l, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v44.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v182.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v43.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v57.l ; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v89.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v74.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v14 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v9.l, v9.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 
0xff, v41.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v42.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v144.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v56.l ; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v10.h, v11.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v14 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v18 @@ -161071,21 +161163,21 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v10.l, v12.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v11.l ; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v61.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v183.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v42.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v46.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v16, v14 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v18 ; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v11.l, v12.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v104.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v88.h ; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v11.h, v12.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v176.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v166.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v40.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v181.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v16, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v167.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v183.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v57.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v146.h ; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v12.h, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v78.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v58.h ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 @@ -161097,54 +161189,54 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) 
{ ; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v15.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v16, v14 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v160.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v167.l ; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v13.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v77.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v95.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v150.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v164.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v78.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v165.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v64, v18, v14 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v20 ; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v93.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v75.h ; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v149.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v162.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v65, v18, v14 ; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v15.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v48.l ; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v71.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v79.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v83.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v92.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v16 ; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v15.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v92.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v134.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v179.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v148.l ; 
GFX11-TRUE16-NEXT: v_or_b16 v16.l, v13.h, v15.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, v18, v14 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v20 -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v70.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v80.h ; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v15.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v74.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v89.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v53.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v67, v18, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v46.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v61.l ; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v15.l ; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v84.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v63.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v87.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v77.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v15.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v14 ; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v55.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v62.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v76.l ; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v81.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v60.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v82.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v73.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v17 ; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v16.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v13.l @@ -161153,56 +161245,56 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v13.h, v16.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v20, v14 ; GFX11-TRUE16-NEXT: v_and_b32_e32 
v19, 0xffff, v22 -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v86.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v97.h ; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v17.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v80.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v45.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v70.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v60.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v19, v14 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v20 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v18.h ; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v13.h, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v83.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v40.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v84.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v47.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v18.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v19, v14 ; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v21.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v38.l ; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v97.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v182.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v100.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v45.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v20 ; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v19.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v82.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v181.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v71.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v44.l ; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v13.h, v19.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v22, v14 ; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v22, 0xffff, v24 -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v87.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v96.h ; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v20.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v21.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v177.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v41.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v37.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v22, v14 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v24 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v147.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v161.l ; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v13.h, v21.l ; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v21.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v101.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v163.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v103.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v178.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v21.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v22, v14 -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v85.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v162.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v81.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v177.l ; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v98.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v151.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v99.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v166.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v23 ; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v22.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v13.l @@ -161211,40 +161303,40 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.h, v22.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v26, v14 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 
0xffff, v28 -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v112.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v113.h ; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v23.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v145.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v85.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v151.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v25, v14 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v26 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v24.h ; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.h, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v100.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v135.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v102.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v150.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v24.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v25, v14 ; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v27.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v35.l ; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v113.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v131.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v115.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v145.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v26 ; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v25.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v99.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.l, 8, v130.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v86.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.l, 8, v135.l ; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v13.h, v25.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v28, v14 ; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v30 -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v103.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v112.h ; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v26.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v27.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v29.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v129.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v132.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v34.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v28, v14 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v30 @@ -161252,17 +161344,17 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v13.h, v27.l ; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v27.h ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v116.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v128.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v131.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v27.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v28, v14 -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v102.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.l, 8, v119.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v98.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.l, 8, v130.l ; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v29 -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v115.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v118.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v114.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v128.l ; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v28.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v13.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -161271,9 +161363,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v13.h, v28.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v32, 
v14 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v34 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v117.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v119.l ; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v114.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v101.h ; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v31, v14 @@ -161332,128 +161424,132 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v128i8: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x15 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:96 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:92 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:24 
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:20 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:12 +; GFX11-FAKE16-NEXT: s_clause 0x19 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:12 ; GFX11-FAKE16-NEXT: s_clause 0x2 ; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 ; 
GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr89 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr76 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr63 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr62 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr59 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr58 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr160 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr45 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr131 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr148 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr135 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr131 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr130 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr79 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr63 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr61 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 -; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr160 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr46 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr182 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 -; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr115 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr130 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr115 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 @@ -161462,863 +161558,867 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB90_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[29:30] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, 
v[15:16] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[80:81], 24, v[11:12] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[85:86], 24, v[7:8] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[98:99], 24, v[3:4] ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[5:6] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 24, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 8, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 8, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 24, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 8, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 8, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 24, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 8, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 8, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 24, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 8, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 8, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 24, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v8 -; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 8, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v181, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 8, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 24, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 8, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 8, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 24, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 8, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 8, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 24, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 8, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v76, 8, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 24, v32 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 16, v32 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 8, v32 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 16, v31 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 8, v31 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 24, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 16, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 8, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 24, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 8, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 8, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v151, 24, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 8, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 8, v25 -; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 24, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 8, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v178, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 8, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 24, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 8, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v40, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 8, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v43, 24, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v46, 8, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v47, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v57, 8, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v59, 24, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v61, 8, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 8, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[3:4] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[19:20] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[17:18] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[31:32] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[27:28] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 24, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 8, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 8, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 24, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 8, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v151, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 8, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 24, v12 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 8, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 8, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v178, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 8, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v181, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 24, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 8, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v43, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 8, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v47, 24, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 8, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v59, 8, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 24, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 8, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v76, 24, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v78, 8, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v88, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v89, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 24, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 8, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 16, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 8, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 24, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 8, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 16, v29 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 8, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 24, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 8, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 8, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 24, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 8, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 8, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 24, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 8, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v40, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 8, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 24, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 8, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v46, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 8, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v57, 24, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 8, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v61, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 8, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 24, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 8, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v77, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v79, 8, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[81:82], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[86:87], 24, v[5:6] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[99:100], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[25:26] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24] +; 
GFX11-FAKE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[21:22] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[19:20] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18] ; GFX11-FAKE16-NEXT: .LBB90_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB90_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v18 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v17 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v18 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v17 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v17 +; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v36, 16, 1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v33, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v33 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 ; GFX11-FAKE16-NEXT: v_add3_u32 v17, v17, v33, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v17, v17, v36 :: v_dual_and_b32 v18, 0xffff0000, v18 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v48, v36, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v17, v17, v35 :: 
v_dual_add_f32 v18, 0x40c00000, v18 ; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v18, 16, 1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v18 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add3_u32 v37, v37, v18, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v77, v37, v39 :: v_dual_add_f32 v34, 0x40c00000, v34 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v20 -; GFX11-FAKE16-NEXT: v_perm_b32 v69, v77, v17, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v132, v37, v39 :: v_dual_lshlrev_b32 v37, 16, v20 ; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v34 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v59, 24, v69 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_add3_u32 v38, v38, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_perm_b32 v34, v132, v17, 0x7060302 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v132 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v35, v38, v18 :: v_dual_add_f32 v18, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v19 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v61, 8, v69 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v38, v18 :: v_dual_add_f32 v20, 0x40c00000, v20 -; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v18, 
0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_add3_u32 v36, v48, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v18, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v18 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v36, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v19 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v18, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX11-FAKE16-NEXT: v_perm_b32 v68, v34, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v20, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v35, v37, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v38, 0x40c00000, v19 :: v_dual_lshlrev_b32 v37, 16, v22 -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v20, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v18, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v33, v39 :: v_dual_add_f32 v20, 0x40c00000, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v18 +; GFX11-FAKE16-NEXT: v_add3_u32 v36, v36, v18, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v19 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v20, 16, 1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v20 +; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v37, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v36, v39, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v38, 16, 1 -; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 16, v68 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 8, v68 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v34, v19, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v36, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v36 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v38, 0x7fff -; GFX11-FAKE16-NEXT: v_perm_b32 v65, v19, v18, 0x7060302 -; GFX11-FAKE16-NEXT: v_add3_u32 v20, v39, v36, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 16, v19 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v43, 24, v65 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v20, v34, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v34 -; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v20, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v35, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v20 +; GFX11-FAKE16-NEXT: v_add3_u32 v38, v38, v20, 0x7fff +; GFX11-FAKE16-NEXT: v_perm_b32 v33, v33, v35, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 v20, v49, v37, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v49, 16, v22 +; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v48, 16, 1 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v19, v38, v19 :: v_dual_and_b32 v22, 0xffff0000, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v37 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v39, v36, v48, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v48 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 8, v34 +; 
GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v20, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v49 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v77, 16, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v79, 8, v33 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v39, v50, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v20, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v20 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_add3_u32 v36, v36, v20, 0x7fff +; GFX11-FAKE16-NEXT: v_perm_b32 v38, v19, v18, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v37, v37, v36, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 v39, v39, v20, 0x7fff ; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX11-FAKE16-NEXT: v_perm_b32 v64, v35, v34, 0x7060302 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v46, 8, v65 -; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v37, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v36, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_dual_add_f32 v22, 0x40c00000, v22 :: v_dual_add_f32 v39, 0x40c00000, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v47, 16, v64 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v57, 8, v64 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[64:65] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v57, 24, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 8, v38 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v39, v50, vcc_lo +; GFX11-FAKE16-NEXT: v_dual_add_f32 v22, 0x40c00000, v22 :: v_dual_add_f32 v51, 0x40c00000, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v61, 16, v37 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v22, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v22, 16, 1 ; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v22 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v24 -; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v39, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v22, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v22, v48, v37, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v39 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX11-FAKE16-NEXT: v_add3_u32 v36, v36, v39, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v21, v35, v21, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v37 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v51, 16, 1 ; GFX11-FAKE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v71, v21, v20, 0x7060302 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 16, v21 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v35, v22, v35 :: v_dual_add_f32 v22, 0x40c00000, v38 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23 +; GFX11-FAKE16-NEXT: v_add3_u32 v49, v49, v22, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 8, v37 +; GFX11-FAKE16-NEXT: v_add3_u32 v39, v39, v51, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v21, v49, v21, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v48, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v48 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v53, v21, v20, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 v22, v52, v48, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v51 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 16, v21 +; GFX11-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v22, v49, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v50 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v23 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v35 -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v22, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v48, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v48 +; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v22, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v39, v39, v52 :: v_dual_add_f32 v50, 0x40c00000, v50 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, 0x400000, v22 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX11-FAKE16-NEXT: v_add3_u32 v37, v37, v22, 0x7fff -; GFX11-FAKE16-NEXT: v_perm_b32 v70, v36, v35, 0x7060302 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v23 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v38, 16, 1 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v22, v37, v39 :: v_dual_lshlrev_b32 v39, 16, v26 -; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v24, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v49, v49, v22, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v52, v39, v48, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v24, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v54, 0x40c00000, v23 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, 0x400000, v24 +; GFX11-FAKE16-NEXT: v_bfe_u32 v55, v50, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v22, v49, v51, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v39, v39, v24, 0x7fff ; 
GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v48, 16, 1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX11-FAKE16-NEXT: v_add3_u32 v36, v36, v24, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v24, v49, v38, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v48 -; GFX11-FAKE16-NEXT: v_add3_u32 v37, v37, v48, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v26, 0x40c00000, v26 :: v_dual_cndmask_b32 v23, v36, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v38 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v40, 16, v70 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 16, v23 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v24, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v39 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v25 +; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v54, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v24, v55, v50, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v64, 0x400000, v54 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v23, v39, v23, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v50 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 +; GFX11-FAKE16-NEXT: v_add3_u32 v51, v49, v54, 0x7fff +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v46, 16, v52 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 16, v23 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v49, v24, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v55 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v24, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v51, v64, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v25 ; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX11-FAKE16-NEXT: v_perm_b32 v81, v23, v22, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v24, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v37, v49, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v24 +; GFX11-FAKE16-NEXT: v_add3_u32 v50, v50, v24, 0x7fff ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX11-FAKE16-NEXT: v_add3_u32 v38, v38, v24, 0x7fff -; GFX11-FAKE16-NEXT: v_perm_b32 v80, v37, v36, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v26, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v49, 0x40c00000, v25 +; GFX11-FAKE16-NEXT: v_perm_b32 v69, v39, v49, 0x7060302 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v55, 0x40c00000, v25 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v51, 0x40c00000, v51 +; GFX11-FAKE16-NEXT: v_perm_b32 v70, v23, v22, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v50, v54, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v49 +; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v55, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v65, 0x400000, v55 +; GFX11-FAKE16-NEXT: v_bfe_u32 v64, v51, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v40, 16, v69 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v54, v50, v55, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v26, 16, 1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v26 -; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v39, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v38, v48, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v37, v37, v26, 0x7fff ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v49, 16, 1 -; 
GFX11-FAKE16-NEXT: v_add3_u32 v26, v50, v39, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v28 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v49 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v25, v37, v25, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v39 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 -; GFX11-FAKE16-NEXT: v_add3_u32 v38, v38, v49, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v36 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v178, 16, v80 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v37, v26, v37 :: v_dual_add_f32 v26, 0x40c00000, v48 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 +; GFX11-FAKE16-NEXT: v_add3_u32 v39, v39, v26, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v26, v64, v51, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v28 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v25, v39, v25 :: v_dual_and_b32 v28, 0xffff0000, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v51 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51 ; GFX11-FAKE16-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v27 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 16, v25 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v50, v26, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v26, 0x40c00000, v64 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v26, 16, 1 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v39, v54, v65 :: v_dual_lshlrev_b32 v54, 16, v27 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v26, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v38, v38, v50, vcc_lo -; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v26 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v26 +; GFX11-FAKE16-NEXT: v_add3_u32 v51, v51, v26, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v83, v39, v50, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v28, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v54, 0x40c00000, v54 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 -; GFX11-FAKE16-NEXT: v_add3_u32 v39, v39, v26, 0x7fff -; GFX11-FAKE16-NEXT: v_perm_b32 v82, v38, v37, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v28, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v50, 0x40c00000, v27 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v64, 0x40c00000, v27 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v28 -; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v48, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v39, v49, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v38, v38, v28, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v39, v39, v28, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v65, v54, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v51, v55, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 -; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v50, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v28, v51, v48, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v49, 16, v30 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, 0x400000, v50 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v27, v38, v27, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v48 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 -; GFX11-FAKE16-NEXT: v_add3_u32 v39, v39, v50, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v64, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v64 +; GFX11-FAKE16-NEXT: v_add3_u32 v28, v65, v54, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v30 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v27, v39, v27, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v54 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 
vcc_lo, v54, v54 +; GFX11-FAKE16-NEXT: v_add3_u32 v55, v51, v64, 0x7fff ; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX11-FAKE16-NEXT: v_perm_b32 v83, v25, v24, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v38, v28, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v28, 0x40c00000, v49 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v30, 0x40c00000, v30 :: v_dual_lshlrev_b32 v49, 16, v29 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX11-FAKE16-NEXT: v_perm_b32 v84, v25, v24, 0x7060302 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 16, v27 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v51, v28, v39 :: v_dual_add_f32 v28, 0x40c00000, v65 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 24, v84 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v28, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v39, v51, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v28 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 -; GFX11-FAKE16-NEXT: v_add3_u32 v48, v48, v28, 0x7fff -; GFX11-FAKE16-NEXT: v_perm_b32 v84, v39, v38, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v54, v28, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v55, v66, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v64, 0x400000, v28 +; GFX11-FAKE16-NEXT: v_add3_u32 v54, v54, v28, 0x7fff +; GFX11-FAKE16-NEXT: v_perm_b32 v96, v39, v51, 0x7060302 ; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v30, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v51, 0x40c00000, v29 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v65, 0x40c00000, v29 ; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v30 -; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v49, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v48, v50, vcc_lo ; GFX11-FAKE16-NEXT: v_add3_u32 v39, v39, v30, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v66, v55, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v54, v64, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 -; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v51, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v30, v52, v49, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v54, v65, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, 0x400000, v65 +; GFX11-FAKE16-NEXT: v_add3_u32 v30, v66, v55, 0x7fff ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v32 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v51 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v29, v39, v29, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v49 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 -; GFX11-FAKE16-NEXT: v_add3_u32 v48, v48, v51, 0x7fff +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v29, v39, v29 :: v_dual_lshlrev_b32 v66, 16, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v55 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 +; GFX11-FAKE16-NEXT: v_add3_u32 v64, v54, v65, 0x7fff ; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v151, 24, v83 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v39, v30, v39 :: v_dual_add_f32 v30, 0x40c00000, v50 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v97, v27, v26, 0x7060302 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v50 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v54, v30, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v66 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65 ; GFX11-FAKE16-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v51 +; GFX11-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v55, v30, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v64, v67, vcc_lo ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v31 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v30, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v48, v52, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, 0x400000, v30 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v50, 0x40c00000, v50 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v65, 0x400000, v30 +; GFX11-FAKE16-NEXT: v_add3_u32 v55, v55, v30, 0x7fff +; GFX11-FAKE16-NEXT: v_perm_b32 v101, v39, v54, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v32, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v64, 0x40c00000, v64 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 -; GFX11-FAKE16-NEXT: v_add3_u32 v49, v49, v30, 0x7fff -; GFX11-FAKE16-NEXT: v_perm_b32 v86, v48, v39, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v32, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v52, 0x40c00000, v31 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v66, 0x40c00000, v31 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, 0x400000, v32 -; GFX11-FAKE16-NEXT: v_bfe_u32 v53, v50, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v49, v51, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v48, v48, v32, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v39, v39, v32, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v67, v64, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v55, v65, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v52, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v53, v50, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v52 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v48, v31, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v50 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 -; 
GFX11-FAKE16-NEXT: v_add3_u32 v49, v49, v52, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v55, v66, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v68, 0x400000, v66 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v67, v64, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 16, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v39, v31, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v64 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64 +; GFX11-FAKE16-NEXT: v_add3_u32 v65, v55, v66, 0x7fff ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-FAKE16-NEXT: v_perm_b32 v85, v27, v26, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v32, v48, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v32, 0x40c00000, v51 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v51, 16, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v32, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v49, v49, v53, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v32 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v51, 0x40c00000, v51 +; GFX11-FAKE16-NEXT: v_perm_b32 v102, v29, v28, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v55, v32, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v32, 0x40c00000, v67 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[101:102] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[96:97] +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v65, v68, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v64, v32, 16, 1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v68, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_perm_b32 v112, v39, v55, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v2, 16, 1 +; GFX11-FAKE16-NEXT: 
v_add3_u32 v64, v64, v32, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v65, 0x40c00000, v65 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-FAKE16-NEXT: v_add3_u32 v50, v50, v32, 0x7fff -; GFX11-FAKE16-NEXT: v_perm_b32 v96, v49, v48, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v53, 0x40c00000, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_bfe_u32 v55, v51, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v50, v52, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v49, v49, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[83:84] +; GFX11-FAKE16-NEXT: v_add3_u32 v39, v39, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v71, v65, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 8, v84 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v67, 0x40c00000, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v64, v66, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v53, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v51 -; GFX11-FAKE16-NEXT: v_perm_b32 v87, v29, v28, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v49, v54, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v49, v55, v51, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51 -; GFX11-FAKE16-NEXT: v_add3_u32 v52, v32, v53, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v64, 0x400000, v65 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v67, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v39, v68, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v39, v71, v65, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 16, v4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65 +; GFX11-FAKE16-NEXT: v_add3_u32 v66, v32, v67, 0x7fff +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v71, 0x400000, v67 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v49, v50, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v49, 0x40c00000, v54 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v115, v2, v1, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v39, v64, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v39, 0x40c00000, v68 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v67, v67 ; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v49, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v50, v52, v55, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v65, v39, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v64, v66, v71, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v49 -; GFX11-FAKE16-NEXT: v_add3_u32 v51, v51, v49, 0x7fff -; GFX11-FAKE16-NEXT: v_perm_b32 v98, v50, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v52, 0x40c00000, v52 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, 0x400000, v39 +; GFX11-FAKE16-NEXT: v_add3_u32 v65, v65, v39, 0x7fff +; GFX11-FAKE16-NEXT: v_perm_b32 v114, v64, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v64, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v66, 0x40c00000, v66 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 ; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v4 -; GFX11-FAKE16-NEXT: v_add3_u32 v50, v50, v4, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v55, v52, 16, 
1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v49, v51, v53, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v68, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_add3_u32 v64, v64, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v71, v66, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v65, v67, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v52 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 16, v98 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v76, 8, v98 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v50, v54, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v50, v55, v52, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v6 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 -; GFX11-FAKE16-NEXT: v_add3_u32 v51, v51, v3, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v65, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, 0x400000, v66 +; GFX11-FAKE16-NEXT: v_perm_b32 v113, v31, v30, 0x7060302 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v76, 24, v115 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v64, v68, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v64, v71, v66, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 16, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66 +; GFX11-FAKE16-NEXT: v_add3_u32 v65, v65, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v71, 0x400000, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v52, 0x40c00000, v54 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v50, v50, v53, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v66, 0x40c00000, v68 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v64, v64, v67, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_lshlrev_b32 v53, 16, v5 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_lshlrev_b32 v67, 16, v5 ; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v52 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v51, v55, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v52, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v53, 0x40c00000, v53 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v68, 0x400000, v66 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v65, v71, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v65, v66, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v67, 0x40c00000, v67 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66 ; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GFX11-FAKE16-NEXT: v_perm_b32 v100, v3, v50, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v116, v3, v64, 0x7060302 ; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v51, v51, v52, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_bfe_u32 v66, v53, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v65, v65, v66, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v71, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v80, v67, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v66, v5, 16, 1 ; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v51, v51, v54, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v65, v65, v68, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v66, v53, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v53 -; GFX11-FAKE16-NEXT: v_add3_u32 v52, v52, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v5 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v55, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v8 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v80, v67, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v68, 0x400000, v67 +; GFX11-FAKE16-NEXT: v_add3_u32 v66, v66, v5, 0x7fff +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v80, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v71, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 16, v8 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v67, v67 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX11-FAKE16-NEXT: v_perm_b32 v97, v31, v30, 0x7060302 -; GFX11-FAKE16-NEXT: v_perm_b32 v103, v3, v51, 0x7060302 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 16, v3 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v53, v6, v54 :: v_dual_add_f32 v6, 0x40c00000, v55 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v32 +; GFX11-FAKE16-NEXT: v_perm_b32 v119, v3, v65, 0x7060302 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 16, v3 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v67, v6, v68 :: v_dual_add_f32 v6, 0x40c00000, v71 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-FAKE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 16, v7 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v52, v66, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v54, 0x40c00000, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v71, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v66, v80, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v66, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v68, 0x40c00000, v68 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX11-FAKE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GFX11-FAKE16-NEXT: v_perm_b32 v102, v5, v53, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v118, v5, v67, 0x7060302 ; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v8, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v52, v52, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v8 -; GFX11-FAKE16-NEXT: v_bfe_u32 v67, v54, 16, 1 -; GFX11-FAKE16-NEXT: v_perm_b32 v99, v2, v1, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 
v66, v66, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v80, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_bfe_u32 v81, v68, 16, 1 +; GFX11-FAKE16-NEXT: v_perm_b32 v117, v4, v39, 0x7060302 ; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v8, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v52, v55, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v66, v71, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v7, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v67, v54, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, 0x400000, v7 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v66, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v10 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 -; GFX11-FAKE16-NEXT: v_add3_u32 v52, v52, v7, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v66, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v81, v68, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v71, 0x400000, v68 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v81, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v5, v80 :: v_dual_lshlrev_b32 v80, 16, v10 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v68, v68 +; GFX11-FAKE16-NEXT: v_add3_u32 v66, v66, v7, 0x7fff ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v54, v8, v55, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v66 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v68, v8, v71, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v80 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v10, 0x40c00000, v10 :: v_dual_lshlrev_b32 v55, 16, v9 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v10, 0x40c00000, v10 :: v_dual_lshlrev_b32 v71, 16, v9 ; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v9, 0xffff0000, v9 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v8 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v52, v67, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v8, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v80, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v66, v81, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v66, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v71, 0x40c00000, v71 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 ; GFX11-FAKE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GFX11-FAKE16-NEXT: v_perm_b32 v182, v7, v54, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v44, v7, v68, 0x7060302 ; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v10, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v52, v52, v8, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, 0x400000, v10 -; GFX11-FAKE16-NEXT: v_bfe_u32 v112, v55, 16, 1 -; GFX11-FAKE16-NEXT: v_perm_b32 v183, v5, v6, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 v66, v66, v8, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v81, 0x400000, v10 +; GFX11-FAKE16-NEXT: v_bfe_u32 v82, v71, 16, 1 +; GFX11-FAKE16-NEXT: v_perm_b32 v45, v5, v6, 0x7060302 ; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v10, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v52, v66, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v66, v80, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v9, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v10, v112, v55, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v9 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v7, v67, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 16, v12 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 -; GFX11-FAKE16-NEXT: v_add3_u32 v52, v52, v9, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v66, v9, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v10, v82, v71, 0x7fff +; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v80, 0x400000, v71 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v82, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v7, v81, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 16, v12 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v71, v71 +; GFX11-FAKE16-NEXT: v_add3_u32 v66, v66, v9, 0x7fff ; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX11-FAKE16-NEXT: v_perm_b32 v101, v4, v49, 0x7060302 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 16, v4 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v55, v10, v66 :: v_dual_add_f32 v10, 0x40c00000, v67 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v64 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v26 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v71, v10, v80 :: v_dual_add_f32 v10, 0x40c00000, v81 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 ; GFX11-FAKE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v49 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v50 -; GFX11-FAKE16-NEXT: v_bfe_u32 v66, v10, 16, 1 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v9, v52, v112 :: v_dual_lshlrev_b32 v52, 16, v11 -; GFX11-FAKE16-NEXT: v_bfe_u32 v67, v12, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX11-FAKE16-NEXT: v_bfe_u32 v80, v10, 16, 1 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v9, v66, v82 :: v_dual_lshlrev_b32 v66, 16, v11 +; GFX11-FAKE16-NEXT: v_bfe_u32 v81, v12, 16, 1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v176, v9, v55, 0x7060302 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v52 -; GFX11-FAKE16-NEXT: v_add3_u32 v52, v66, v10, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v10 -; GFX11-FAKE16-NEXT: v_add3_u32 v67, v67, v12, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v12 -; GFX11-FAKE16-NEXT: v_bfe_u32 v113, v9, 16, 1 
+; GFX11-FAKE16-NEXT: v_perm_b32 v182, v9, v71, 0x7060302 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v66 +; GFX11-FAKE16-NEXT: v_add3_u32 v66, v80, v10, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v80, 0x400000, v10 +; GFX11-FAKE16-NEXT: v_add3_u32 v81, v81, v12, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v82, 0x400000, v12 +; GFX11-FAKE16-NEXT: v_bfe_u32 v85, v9, 16, 1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 0x40c00000, v11 :: v_dual_cndmask_b32 v10, v52, v66 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 0x40c00000, v11 :: v_dual_cndmask_b32 v10, v66, v80 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v9 -; GFX11-FAKE16-NEXT: v_add3_u32 v52, v113, v9, 0x7fff -; GFX11-FAKE16-NEXT: v_perm_b32 v177, v7, v8, 0x7060302 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v32 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v12, v67, v112 :: v_dual_lshlrev_b32 v67, 16, v14 -; GFX11-FAKE16-NEXT: v_bfe_u32 v112, v11, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v80, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_add3_u32 v66, v85, v9, 0x7fff +; GFX11-FAKE16-NEXT: v_perm_b32 v183, v7, v8, 0x7060302 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v20 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v81, v82, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 16, v14 +; GFX11-FAKE16-NEXT: v_bfe_u32 v82, v11, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v26 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v9, v52, v66 :: v_dual_add_f32 v52, 0x40c00000, v67 -; GFX11-FAKE16-NEXT: v_add3_u32 v66, v112, v11, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, 0x400000, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v112, 16, v13 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v20, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v35 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v9, v66, v80 :: v_dual_add_f32 v66, 0x40c00000, v81 +; GFX11-FAKE16-NEXT: v_add3_u32 v80, v82, v11, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v81, 0x400000, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 16, v13 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-FAKE16-NEXT: v_bfe_u32 v113, v52, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v85, v66, 16, 1 ; GFX11-FAKE16-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v24 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v11, v66, v67 :: v_dual_add_f32 v66, 0x40c00000, v112 -; GFX11-FAKE16-NEXT: v_add3_u32 v67, v113, v52, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v52 -; GFX11-FAKE16-NEXT: v_bfe_u32 v113, v14, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 -; GFX11-FAKE16-NEXT: v_bfe_u32 v114, v66, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[112:113] +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v11, v80, v81 :: v_dual_add_f32 v80, 0x40c00000, v82 +; GFX11-FAKE16-NEXT: v_add3_u32 v81, v85, v66, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v82, 0x400000, v66 +; GFX11-FAKE16-NEXT: v_bfe_u32 v85, v14, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66 +; GFX11-FAKE16-NEXT: v_bfe_u32 v86, v80, 16, 1 ; GFX11-FAKE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v115, 0x400000, v66 -; GFX11-FAKE16-NEXT: v_perm_b32 v162, v11, v9, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v52, v67, v112, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v67, v113, v14, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v14 -; GFX11-FAKE16-NEXT: v_add3_u32 v113, v114, v66, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v114, 16, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v87, 0x400000, v80 +; 
GFX11-FAKE16-NEXT: v_perm_b32 v176, v11, v9, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v66, v81, v82, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v81, v85, v14, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v82, 0x400000, v14 +; GFX11-FAKE16-NEXT: v_add3_u32 v85, v86, v80, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v86, 16, v16 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 -; GFX11-FAKE16-NEXT: v_bfe_u32 v116, v13, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v98, v13, 16, 1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v20 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v67, v112 :: v_dual_add_f32 v67, 0x40c00000, v114 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66 -; GFX11-FAKE16-NEXT: v_add3_u32 v112, v116, v13, 0x7fff +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 16, v7 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, v81, v82, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v81, 0x40c00000, v86 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v80, v80 +; GFX11-FAKE16-NEXT: v_add3_u32 v82, v98, v13, 0x7fff ; GFX11-FAKE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v149, v14, v52, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v114, v67, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v66, v113, v115, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v113, 0x400000, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v115, 16, v15 +; GFX11-FAKE16-NEXT: v_perm_b32 v163, v14, v66, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v86, v81, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v80, v85, v87, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v85, 0x400000, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 16, v15 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX11-FAKE16-NEXT: v_perm_b32 
v163, v12, v10, 0x7060302 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v13, v112, v113 :: v_dual_add_f32 v112, 0x40c00000, v115 -; GFX11-FAKE16-NEXT: v_add3_u32 v113, v114, v67, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v114, 0x400000, v67 -; GFX11-FAKE16-NEXT: v_bfe_u32 v115, v16, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v67, v67 +; GFX11-FAKE16-NEXT: v_perm_b32 v177, v12, v10, 0x7060302 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v10 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v13, v82, v85 :: v_dual_add_f32 v82, 0x40c00000, v87 +; GFX11-FAKE16-NEXT: v_add3_u32 v85, v86, v81, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v86, 0x400000, v81 +; GFX11-FAKE16-NEXT: v_bfe_u32 v87, v16, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v81, v81 ; GFX11-FAKE16-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX11-FAKE16-NEXT: v_bfe_u32 v116, v112, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v117, 0x400000, v112 -; GFX11-FAKE16-NEXT: v_perm_b32 v148, v13, v66, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v67, v113, v114, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v114, v115, v16, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v115, 0x400000, v16 +; GFX11-FAKE16-NEXT: v_bfe_u32 v98, v82, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v99, 0x400000, v82 +; GFX11-FAKE16-NEXT: v_perm_b32 v162, v13, v80, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v81, v85, v86, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v86, v87, v16, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v87, 0x400000, v16 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 -; GFX11-FAKE16-NEXT: v_bfe_u32 v113, v15, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v116, v116, v112, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v118, 0x400000, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v33 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v114, v115, vcc_lo -; 
GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v112, v112 -; GFX11-FAKE16-NEXT: v_add3_u32 v113, v113, v15, 0x7fff -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[96:97] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[86:87] -; GFX11-FAKE16-NEXT: v_perm_b32 v135, v16, v67, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v112, v116, v117, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v52 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[84:85] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v5 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v113, v118, vcc_lo -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v10 +; GFX11-FAKE16-NEXT: v_bfe_u32 v85, v15, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v98, v98, v82, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v100, 0x400000, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v80 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v86, v87, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v82, v82 +; GFX11-FAKE16-NEXT: v_add3_u32 v85, v85, v15, 0x7fff ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v8 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; GFX11-FAKE16-NEXT: v_perm_b32 v134, v15, v112, 0x7060302 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v112 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 16, v31 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v48 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v51 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[134:135] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[148:149] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[162:163] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[176:177] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v53 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v37 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 
v[52:53], 24, v[182:183] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[82:83] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v67 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v66 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v54 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v38 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[102:103] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[98:99] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[80:81] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v55 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v39 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[100:101] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[70:71] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[68:69] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 24, v135 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 8, v135 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 16, v134 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 8, v134 +; GFX11-FAKE16-NEXT: v_perm_b32 v149, v16, v81, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v82, v98, v99, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v81 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v65 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[80:81], 24, v[176:177] +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v85, v100, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[85:86], 24, v[44:45] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[98:99], 24, v[116:117] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v66 +; GFX11-FAKE16-NEXT: v_perm_b32 v148, v15, v82, 0x7060302 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v82 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v71 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v68 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[148:149] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v67 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v55 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v54 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[162:163] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[81:82], 24, v[182:183] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[86:87], 24, v[118:119] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[99:100], 24, v[114:115] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[69:70] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[52:53] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[37:38] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 24, v34 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[33:34] ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 24, v149 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 8, v149 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 16, v148 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 8, v148 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 24, v163 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 8, v163 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 16, v162 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v151, 16, v162 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 8, v162 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 24, v177 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 8, v177 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 24, v177 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 8, v177 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 16, v176 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 8, v176 -; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 24, v183 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 8, v183 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 8, v176 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v178, 24, v183 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 8, v183 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v181, 16, v182 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 8, v182 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 24, v103 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 8, v103 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 16, v102 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 8, v102 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 24, v101 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 8, v101 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 16, v100 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 8, v100 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 24, v99 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 8, v99 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 24, v97 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 24, v45 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 8, v45 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v43, 16, v44 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 8, v44 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v47, 24, v119 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 8, v119 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 16, v118 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v59, 8, v118 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 24, v117 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 8, v117 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 16, v116 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v116 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v78, 8, v115 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v88, 16, v114 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v89, 8, v114 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 24, v113 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 8, v113 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 16, v112 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 8, v112 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v116, 24, v102 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 8, v102 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 16, v101 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 8, v101 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 24, v97 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 8, v97 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 16, v96 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v96 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 8, v96 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 24, v87 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v87 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 16, v86 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 8, v86 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 24, v85 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 8, v85 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 16, v84 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 8, v84 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 16, v83 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 8, v83 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 16, v82 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 8, v82 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 24, v81 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 8, v81 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 8, v80 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 24, v71 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 8, v71 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 24, v70 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 8, v70 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v77 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 8, v69 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 24, v53 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 8, v53 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 8, v52 ; GFX11-FAKE16-NEXT: .LBB90_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v76 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v37, 8, v89 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 
8, v66 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v34, 8, v99 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v63 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v75 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v76 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xff, v60 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xff, v88 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v75 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xff, v73 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, v55, v39 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v73 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v66, v54 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v56 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v37, v34 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v37, 8, v78 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v55 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v166 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xff, v180 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v34 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v55, v65 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v62 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, v37, v51 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v98 ; GFX11-FAKE16-NEXT: v_and_b32_e32 
v7, 0xff, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v52 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v65 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v58 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xff, v160 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v51 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v39 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v65, v66 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v45 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xff, v44 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v37 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v66, v51 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v37, 8, v72 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v62 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v34 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v37 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, v55, v66 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v59 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v58 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v86 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v34 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v65, v53 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v42 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v147 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v41 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v37 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, v55, v66 +; 
GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v56 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v161 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v47 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v34 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v53 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v55, v65 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v183 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v181 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v37 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, v55, v66 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v45 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v43 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v85 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v34 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v53 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v55, v52 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v180 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v132 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v179 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v39 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v52 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v53 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v37 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, v55, v66 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v42 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v146 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v41 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v34 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v37 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v51 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v183 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, v55, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v181 +; GFX11-FAKE16-NEXT: 
v_lshlrev_b16 v66, 8, v81 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v54, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v167 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v177 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v165 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v68, 8, v179 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v54, v51 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v53 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v52 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v51 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xff, v119 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v164 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v55, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v68 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xff, v133 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v51 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v178 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v162 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v161 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v50, 8, v50 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v176 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xff, v167 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v68, 8, v80 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v150 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xff, v118 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v149 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v51, v52 -; GFX11-FAKE16-NEXT: v_or_b32_e32 
v11, v11, v53 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, v54, v50 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v65, v66 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v80, 8, v164 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v81, 0xff, v131 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v82, 8, v163 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, v37, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v66, v68 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v80 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v81, v82 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v37 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v55 ; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v9, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v10, v51 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v11, v50 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v12, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v9, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v10, v37 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v11, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v12, v55 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v6, 8, v148 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v145 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v8, 8, v49 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v6, 8, v162 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v151 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v8, 8, v67 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 
0xff, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v10, 8, v144 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v116 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v12, 8, v135 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v10, 8, v150 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v129 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v12, 8, v149 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v15 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v14, 8, v134 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v14, 8, v148 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v7, v8 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v9, v10 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v12 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v131 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v145 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v64 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v130 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v114 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v129 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v144 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v103 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v135 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v74 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xff, v72 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v48, 8, v67 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v79 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v77 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v33, 8, v33 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v12, v13 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v15 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v39, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v34, v33 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v7, 0xffff, v7 @@ -162335,35 +162435,35 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v12 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v14 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v61 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v69 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v59 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v74 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v132 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v38 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v19 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v57 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v47 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v64 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v63 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v61 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v36 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v46 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v60 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v12, v13 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v15 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v146 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v43 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v160 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v57 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v21 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v18, 8, v70 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v40 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v20, 8, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v18, 8, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v46 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v20, 8, v65 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v71 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xff, v133 -; 
GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v182 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v147 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v34, 8, v44 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v17, v18 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v19, v20 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v21, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v38, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v33, v34 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 @@ -162380,35 +162480,35 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v19 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v23 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v80 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v178 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v18, 8, v37 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v40 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v18, 8, v54 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v20, 8, v81 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v128 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v176 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v20, 8, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v134 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v182 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v25 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v82 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v83 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v17, v18 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v19, v20 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v21, v22 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v23, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v163 -; GFX11-FAKE16-NEXT: 
v_lshlrev_b16 v21, 8, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v177 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v50 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v26 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v83 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v117 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v151 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v130 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v165 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v27 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v84 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v103 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v35, 8, v35 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v96 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v119 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v34, 8, v49 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v21 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v22, v23 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v24, v25 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v26, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v36, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v33, v34 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 @@ -162425,31 +162525,31 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v21, v22 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v23, v24 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v85 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v115 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v102 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v97 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v128 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v118 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v29 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v86 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v101 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 
v27, 8, v34 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v101 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v117 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v48 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v87 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v102 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v21 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v22, v23 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v24, v25 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v26, v27 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v28, v29 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v113 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v71 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v116 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v31 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v28, 8, v96 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v99 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v30, 8, v33 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v28, 8, v112 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v115 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v30, 8, v35 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v32 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v32, 8, v97 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v112 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v34, 8, v98 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v32, 8, v113 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v34, 8, v114 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v26 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v27, v28 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v29, v30 @@ -162477,29 +162577,33 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x15 -; GFX11-FAKE16-NEXT: 
scratch_load_b32 v77, off, s32 offset:12 -; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:20 -; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:92 -; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:96 +; GFX11-FAKE16-NEXT: s_clause 0x19 +; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:44 +; GFX11-FAKE16-NEXT: 
scratch_load_b32 v72, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:112 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -162560,464 +162664,484 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v18 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; 
SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v2 ; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v61, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v8 ; SI-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v10 ; SI-NEXT: v_mul_f32_e32 v63, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v62, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v19 ; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 
v26, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v28 ; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v30 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v29 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s19 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v29 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16 ; SI-NEXT: v_mul_f32_e64 v8, 1.0, s18 ; SI-NEXT: v_mul_f32_e64 v5, 1.0, s21 ; SI-NEXT: v_mul_f32_e64 v6, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s23 ; SI-NEXT: v_mul_f32_e64 v9, 1.0, s22 ; SI-NEXT: v_mul_f32_e64 v10, 1.0, s25 ; SI-NEXT: v_mul_f32_e64 v13, 1.0, s24 ; SI-NEXT: v_mul_f32_e64 v12, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v17, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s28 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v32 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v33 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v34 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v35 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v34 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v35 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; SI-NEXT: v_mul_f32_e32 v35, 1.0, v36 ; SI-NEXT: v_mul_f32_e32 v37, 1.0, v37 -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v38 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v39 ; SI-NEXT: v_mul_f32_e32 v38, 1.0, v48 ; SI-NEXT: v_mul_f32_e32 v48, 1.0, v49 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mul_f32_e32 v28, 1.0, v50 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v51 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: 
s_waitcnt vmcnt(10) expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v51 ; SI-NEXT: v_mul_f32_e32 v50, 1.0, v52 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v53 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v53 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v55 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v54 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v40 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v55 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v41 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v40 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v42 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v41 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v42 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v43 -; SI-NEXT: v_mul_f32_e64 v39, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v49, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s29 -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; 
SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v43 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v42, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s27 +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v61, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, 
s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:624 
; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB91_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mov_b32_e32 v43, v36 -; SI-NEXT: v_alignbit_b32 v36, v1, v2, 16 +; SI-NEXT: v_alignbit_b32 v36, v1, v3, 16 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: v_alignbit_b32 v6, v1, v6, 16 +; SI-NEXT: v_alignbit_b32 v33, v1, v6, 16 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 -; SI-NEXT: v_alignbit_b32 v2, v1, v13, 16 +; SI-NEXT: v_mov_b32_e32 v41, v28 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v28, v1, v13, 16 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v16 -; SI-NEXT: v_alignbit_b32 v5, v1, v17, 16 +; SI-NEXT: v_mov_b32_e32 v40, v25 +; SI-NEXT: v_alignbit_b32 v25, v1, v19, 16 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_alignbit_b32 v4, v1, v3, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v44 -; SI-NEXT: v_alignbit_b32 v3, v1, v7, 16 +; SI-NEXT: v_alignbit_b32 v53, v1, v44, 16 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v47 -; SI-NEXT: v_alignbit_b32 v16, v1, v57, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v13, v1, v58, 16 +; SI-NEXT: v_alignbit_b32 v19, v1, v7, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v57 +; SI-NEXT: v_alignbit_b32 v57, v1, v58, 16 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v62 -; SI-NEXT: v_alignbit_b32 v10, v1, v60, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 -; SI-NEXT: v_alignbit_b32 v44, v19, v8, 16 -; SI-NEXT: v_alignbit_b32 v7, v1, v22, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 -; SI-NEXT: v_alignbit_b32 v8, v44, v36, 24 -; SI-NEXT: v_alignbit_b32 v60, v1, v27, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v29 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; 
SI-NEXT: v_alignbit_b32 v13, v1, v56, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v31 +; SI-NEXT: v_alignbit_b32 v10, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v22 +; SI-NEXT: v_alignbit_b32 v7, v1, v23, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v15 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v44, v36, 16 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v39 -; SI-NEXT: v_alignbit_b32 v57, v1, v30, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v35 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v15, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_alignbit_b32 v44, v2, v8, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v44, v36, 8 -; SI-NEXT: v_alignbit_b32 v58, v22, v9, 16 -; SI-NEXT: v_alignbit_b32 v40, v1, v37, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v44, v36, 24 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v58, v6, 24 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v44, v36, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v58, v6, 16 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v49 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_alignbit_b32 v58, v2, v9, 16 +; SI-NEXT: buffer_store_dword v2, off, 
s[0:3], s32 offset:700 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v58, v6, 8 -; SI-NEXT: v_alignbit_b32 v47, v25, v12, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v58, v33, 24 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v47, v2, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v58, v33, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v47, v2, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v58, v33, 8 +; SI-NEXT: v_alignbit_b32 v47, v23, v12, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v2, v47, v28, 24 +; SI-NEXT: v_alignbit_b32 v6, v1, v27, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v2, v47, v28, 16 +; SI-NEXT: v_alignbit_b32 v5, v1, v32, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v35 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v47, v2, 8 +; SI-NEXT: v_alignbit_b32 v2, v47, v28, 8 +; SI-NEXT: v_alignbit_b32 v4, v1, v37, 16 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v38 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v46 -; SI-NEXT: 
v_alignbit_b32 v53, v1, v48, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v59 +; SI-NEXT: v_alignbit_b32 v3, v1, v48, 16 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v50 -; SI-NEXT: v_alignbit_b32 v50, v8, v59, 16 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v50, v2, v46, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v50, v5, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v50, v25, 24 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v50, v5, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v50, v25, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v50, v5, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v52, v1, v52, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v55 -; SI-NEXT: v_mov_b32_e32 v17, v63 -; SI-NEXT: v_alignbit_b32 v1, v1, v41, 16 +; SI-NEXT: v_alignbit_b32 v2, v50, v25, 8 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v56, v1, v51, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v52 +; SI-NEXT: v_alignbit_b32 v1, v1, v55, 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v15 +; SI-NEXT: v_alignbit_b32 v35, v22, v17, 16 +; SI-NEXT: v_alignbit_b32 v8, v35, v13, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:300 ; 4-byte Folded 
Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v35, v13, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v17, v40 +; SI-NEXT: v_mov_b32_e32 v40, v29 +; SI-NEXT: v_mov_b32_e32 v32, v49 ; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v31 -; SI-NEXT: v_alignbit_b32 v62, v8, v61, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v52, v44, v36, 8 +; SI-NEXT: v_alignbit_b32 v46, v35, v13, 8 +; SI-NEXT: v_mov_b32_e32 v48, v34 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v11 +; SI-NEXT: v_alignbit_b32 v62, v2, v61, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v62, v4, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v62, v53, 24 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v62, v4, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v62, v53, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v56 -; SI-NEXT: v_alignbit_b32 v55, v8, v63, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v62, v53, 8 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v55, v3, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; 
SI-NEXT: v_lshrrev_b32_e32 v2, 16, v60 +; SI-NEXT: v_alignbit_b32 v55, v2, v63, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v55, v3, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v55, v19, 24 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v55, v3, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v48, v62, v4, 8 -; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v31 -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v37 -; SI-NEXT: v_alignbit_b32 v38, v8, v45, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v55, v19, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v38, v16, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v55, v19, 8 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_alignbit_b32 v35, v8, v18, 16 -; SI-NEXT: v_mov_b32_e32 v45, v8 -; SI-NEXT: v_alignbit_b32 v8, v35, v13, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v11 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v35, v13, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: 
v_alignbit_b32 v29, v35, v13, 8 -; SI-NEXT: v_alignbit_b32 v61, v38, v16, 24 -; SI-NEXT: v_alignbit_b32 v41, v38, v16, 16 -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v11, 8, v62 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v9 ; SI-NEXT: v_alignbit_b32 v30, v8, v21, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v8, v30, v10, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v8, v30, v10, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v8, v30, v10, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v23 -; SI-NEXT: v_alignbit_b32 v27, v8, v24, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v24 +; SI-NEXT: v_alignbit_b32 v27, v8, v26, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v8, v27, v7, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v8, v27, v7, 16 -; SI-NEXT: buffer_store_dword v8, off, 
s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v8, v27, v7, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v26, 8, v44 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_alignbit_b32 v38, v51, v2, 16 +; SI-NEXT: v_alignbit_b32 v2, v38, v57, 24 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v20 -; SI-NEXT: v_alignbit_b32 v24, v8, v26, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v38, v57, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v24, v60, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v38, v57, 8 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v24, v60, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; SI-NEXT: v_alignbit_b32 v37, v8, v18, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v32 -; SI-NEXT: v_alignbit_b32 v21, v8, v14, 16 +; SI-NEXT: v_alignbit_b32 v8, v37, v6, 24 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v21, v57, 
24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v37, v6, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v37, v6, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v21, v57, 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v17 +; SI-NEXT: v_alignbit_b32 v21, v8, v29, 16 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v21, v5, 24 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v21, v57, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v21, v5, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v34 -; SI-NEXT: v_alignbit_b32 v18, v8, v15, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v21, v5, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v18, v9 +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v29 +; SI-NEXT: v_alignbit_b32 v61, v8, v34, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v18, v40, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v61, v4, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:220 ; 
4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v18, v40, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v8, v18, v40, 8 +; SI-NEXT: v_alignbit_b32 v8, v61, v4, 16 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v28 -; SI-NEXT: v_alignbit_b32 v63, v8, v51, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v61, v4, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v63, v53, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v41 +; SI-NEXT: v_alignbit_b32 v63, v8, v49, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v63, v53, 16 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v33 +; SI-NEXT: v_alignbit_b32 v8, v63, v3, 24 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v63, v53, 8 -; SI-NEXT: v_alignbit_b32 v12, v40, v43, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v63, v3, 16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v31 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v12, v52, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v63, v3, 8 +; SI-NEXT: v_alignbit_b32 v12, v49, v54, 16 +; 
SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v12, v52, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v12, v56, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v12, v52, 8 +; SI-NEXT: v_alignbit_b32 v8, v12, v56, 16 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v42 -; SI-NEXT: v_mov_b32_e32 v15, v9 -; SI-NEXT: v_alignbit_b32 v9, v8, v54, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v12, v56, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v9, v1, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v39 +; SI-NEXT: v_alignbit_b32 v9, v8, v43, 16 +; SI-NEXT: v_mov_b32_e32 v43, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 24, v24 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v9, v1, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v17, 24, v2 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v9, v1, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v17, 8, v37 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v8, v37 -; SI-NEXT: 
v_lshrrev_b32_e32 v37, 24, v49 -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v17, 24, v43 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v46 -; SI-NEXT: v_lshrrev_b32_e32 v46, 24, v56 -; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v32 -; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v8 -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v9, v1, 24 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v34 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v17, 8, v21 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v38 -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v9, v1, 16 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v28 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v17, 24, v29 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v15 -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v9, v1, 8 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v33 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v17, 
8, v61 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v59 -; SI-NEXT: v_lshrrev_b32_e32 v20, 24, v20 -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v42 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v12 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v17, 24, v41 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v30 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v59 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 8, v24 -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v17, 8, v63 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v42 -; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v39 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v60 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v23 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v17, 24, v31 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: 
s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v20, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v21 -; SI-NEXT: v_lshrrev_b32_e32 v32, 8, v18 -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v45 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v9 -; SI-NEXT: v_alignbit_b32 v26, v24, v60, 16 -; SI-NEXT: v_lshrrev_b32_e32 v51, 8, v44 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v17, 8, v12 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v11, 8, v58 -; SI-NEXT: v_lshrrev_b32_e32 v49, 8, v47 -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v50 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v15 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 8, v62 -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v17, 24, v39 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v55 -; SI-NEXT: v_lshrrev_b32_e32 v15, 8, v35 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v20 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v27 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v37, v34 -; 
SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, v28 -; SI-NEXT: v_mov_b32_e32 v23, v48 +; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v18 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 8, v63 -; SI-NEXT: v_mov_b32_e32 v48, v33 -; SI-NEXT: v_mov_b32_e32 v34, v53 -; SI-NEXT: v_mov_b32_e32 v53, v42 -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v17, 8, v9 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v24, v51 +; SI-NEXT: v_mov_b32_e32 v51, v2 +; SI-NEXT: v_mov_b32_e32 v2, v22 +; SI-NEXT: v_mov_b32_e32 v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v34, 8, v58 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v47 +; SI-NEXT: v_lshrrev_b32_e32 v59, 8, v50 +; SI-NEXT: v_lshrrev_b32_e32 v20, 8, v55 +; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v38 +; SI-NEXT: v_lshrrev_b32_e32 v45, 8, v35 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v30 +; SI-NEXT: v_lshrrev_b32_e32 v18, 8, v27 +; SI-NEXT: v_mov_b32_e32 v15, v43 +; SI-NEXT: v_mov_b32_e32 v43, v39 ; SI-NEXT: s_branch .LBB91_3 ; SI-NEXT: .LBB91_2: -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v40, v29 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload 
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 @@ -163160,139 +163284,116 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: v_mov_b32_e32 v53, v42 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mov_b32_e32 v48, v33 -; SI-NEXT: v_mov_b32_e32 v29, v28 -; SI-NEXT: v_mov_b32_e32 v37, v34 -; SI-NEXT: v_mov_b32_e32 v17, v63 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v43, v39 +; SI-NEXT: v_mov_b32_e32 v32, v49 +; SI-NEXT: v_mov_b32_e32 v48, v34 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v15, v25 ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: 
$vgpr25 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 ; 
SI-NEXT: ; kill: killed $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 ; SI-NEXT: .LBB91_3: ; %Flow -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v56, v17 -; SI-NEXT: v_mov_b32_e32 v54, v61 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v42, v32 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: s_cbranch_vccnz .LBB91_5 ; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 ; 
SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v8 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 
v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v44, 0xffff0000, v44 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_f32_e32 v46, 0x40c00000, v44 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_alignbit_b32 v52, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v56, v3, v2, 16 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 @@ -163303,941 +163404,960 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v34, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v33 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_alignbit_b32 v51, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 ; SI-NEXT: 
s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53 -; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_alignbit_b32 v57, v7, v5, 16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v26 +; SI-NEXT: v_alignbit_b32 v5, v7, v5, 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8 ; SI-NEXT: v_alignbit_b32 v9, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v32 ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v48 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v11 -; 
SI-NEXT: v_alignbit_b32 v12, v8, v7, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v11 +; SI-NEXT: v_alignbit_b32 v12, v59, v7, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v32 +; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v11 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_alignbit_b32 v60, v10, v6, 16 +; SI-NEXT: v_alignbit_b32 v6, v10, v6, 16 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_alignbit_b32 v7, v13, v7, 16 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v14 ; SI-NEXT: v_alignbit_b32 v63, v13, v10, 16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v48 ; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v37 -; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:616 ; 4-byte Folded 
Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v14 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_alignbit_b32 v10, v14, v10, 16 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v32 -; SI-NEXT: v_alignbit_b32 v18, v14, v13, 16 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v49, 0x40c00000, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v49 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v21, v15, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_alignbit_b32 v10, v16, v10, 16 +; 
SI-NEXT: v_lshrrev_b32_e32 v16, 16, v17 +; SI-NEXT: v_alignbit_b32 v61, v16, v13, 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v40 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v41 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_alignbit_b32 v13, v16, v13, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v23 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v17, 24, v17 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_alignbit_b32 v13, v19, v13, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_alignbit_b32 v21, v19, v16, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte 
Folded Reload ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_alignbit_b32 v16, v19, v16, 16 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v19 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_alignbit_b32 v24, v15, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v31 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v37, v2, v19, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v20 -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v24 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 ; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_alignbit_b32 v27, v15, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v3, v22, v19, 16 -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v59 -; SI-NEXT: v_add_f32_e32 v54, 0x40c00000, v22 +; SI-NEXT: v_alignbit_b32 v57, v22, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 8, v37 +; SI-NEXT: buffer_store_dword v17, off, 
s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v54 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v59, 0x40c00000, v44 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v59 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_alignbit_b32 v30, v15, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v17, 8, v21 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v17, 8, v61 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v17, 8, v63 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v17, 8, v12 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v17, 8, v9 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v46 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: 
v_add_f32_e32 v22, 0x40c00000, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v39, 0x40c00000, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v39 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v27, v2, v22, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_alignbit_b32 v4, v25, v22, 16 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_alignbit_b32 v19, v25, v19, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v40, 0x40c00000, v25 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v40 -; SI-NEXT: v_alignbit_b32 v35, v45, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v15 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 
v22, 0xffff0000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v49, 0x40c00000, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v49 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v30, v2, v25, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_alignbit_b32 v5, v28, v25, 16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v41, 0x40c00000, v28 +; SI-NEXT: v_alignbit_b32 v53, v28, v22, 16 +; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v51, 0x40c00000, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_alignbit_b32 v35, v2, v28, 16 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v23 +; 
SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_alignbit_b32 v25, v32, v25, 16 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v28 +; SI-NEXT: v_lshrrev_b32_e32 v45, 8, v35 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v52, 0x40c00000, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v52 +; SI-NEXT: v_alignbit_b32 v38, v24, v32, 16 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v38 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v41 -; SI-NEXT: v_alignbit_b32 v38, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v56 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v17 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; SI-NEXT: v_alignbit_b32 v2, v33, v28, 16 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, 
s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; SI-NEXT: v_add_f32_e32 v43, 0x40c00000, v33 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v28, v33, v28, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v54, 0x40c00000, v33 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v54 +; SI-NEXT: v_alignbit_b32 v55, v18, v32, 16 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v43 -; SI-NEXT: v_alignbit_b32 v55, v61, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 ; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; SI-NEXT: v_alignbit_b32 v6, v36, v33, 16 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_alignbit_b32 v33, v36, v33, 16 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], 
s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v36 -; SI-NEXT: v_add_f32_e32 v46, 0x40c00000, v36 -; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v46 +; SI-NEXT: v_add_f32_e32 v41, 0x40c00000, v36 +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v41 +; SI-NEXT: v_alignbit_b32 v62, v15, v32, 16 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; SI-NEXT: v_alignbit_b32 v62, v15, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v39 -; SI-NEXT: v_alignbit_b32 v36, v39, v36, 16 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v39 -; SI-NEXT: v_add_f32_e32 v42, 0x40c00000, v39 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v42 -; SI-NEXT: v_alignbit_b32 v50, v17, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: s_waitcnt vmcnt(0) 
-; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v39 -; SI-NEXT: v_add_f32_e32 v56, 0x40c00000, v39 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v56 -; SI-NEXT: v_alignbit_b32 v47, v25, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v39 -; SI-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v39 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v39 -; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v50 -; SI-NEXT: v_alignbit_b32 v58, v22, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v56 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v42 -; SI-NEXT: v_lshrrev_b32_e32 v42, 8, v63 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v46 -; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v55 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v18, 24, v41 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v43 -; SI-NEXT: v_alignbit_b32 v43, v38, v16, 8 -; SI-NEXT: buffer_store_dword v15, 
off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 24, v54 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v41 -; SI-NEXT: v_alignbit_b32 v41, v38, v16, 16 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 24, v52 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v40 -; SI-NEXT: v_mov_b32_e32 v40, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v54 -; SI-NEXT: v_alignbit_b32 v54, v38, v16, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 24, v51 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v20 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v20, v35, v13, 8 +; SI-NEXT: v_lshrrev_b32_e32 v18, 24, v49 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 8, v35 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 24, v39 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v23 -; SI-NEXT: v_alignbit_b32 v23, v62, v4, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 24, v31 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v49 -; SI-NEXT: v_lshrrev_b32_e32 v49, 8, v47 -; SI-NEXT: 
buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 24, v20 +; SI-NEXT: v_mov_b32_e32 v49, v59 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v11, 8, v62 +; SI-NEXT: v_lshrrev_b32_e32 v20, 8, v55 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 8, v18 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 8, v27 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_alignbit_b32 v36, v48, v36, 16 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v40, 0x40c00000, v48 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v40 +; SI-NEXT: v_alignbit_b32 v50, v15, v32, 16 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 8, v62 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v46 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v46, v35, v13, 8 +; SI-NEXT: v_lshrrev_b32_e32 v59, 8, v50 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: 
v_and_b32_e32 v48, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v42, 0x40c00000, v48 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v42 +; SI-NEXT: v_alignbit_b32 v47, v22, v32, 16 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v43, 0x40c00000, v48 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v43 +; SI-NEXT: v_alignbit_b32 v58, v23, v32, 16 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 8, v58 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v43 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v26 -; SI-NEXT: v_alignbit_b32 v26, v24, v60, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_alignbit_b32 v44, v19, v14, 16 -; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v59 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v42 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v40 +; SI-NEXT: v_lshrrev_b32_e32 v34, 8, v58 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v32, 
0x40c00000, v32 +; SI-NEXT: v_alignbit_b32 v44, v16, v32, 16 ; SI-NEXT: v_alignbit_b32 v8, v44, v36, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v8, v44, v36, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v58, v33, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v44, v36, 8 +; SI-NEXT: v_alignbit_b32 v8, v58, v33, 16 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v58, v6, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v58, v33, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v58, v6, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v47, v28, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v58, v6, 8 +; SI-NEXT: v_alignbit_b32 v8, v47, v28, 16 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v47, v2, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v47, v28, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
v_alignbit_b32 v8, v47, v2, 16 +; SI-NEXT: v_alignbit_b32 v8, v50, v25, 24 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v47, v2, 8 +; SI-NEXT: v_alignbit_b32 v8, v50, v25, 16 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v50, v5, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v50, v25, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v50, v5, 16 +; SI-NEXT: v_alignbit_b32 v8, v62, v53, 24 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v50, v5, 8 +; SI-NEXT: v_alignbit_b32 v8, v62, v53, 16 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v62, v4, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v62, v53, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v62, v4, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v55, v19, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v55, v3, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v55, v19, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v55, v3, 16 -; SI-NEXT: 
buffer_store_dword v8, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v55, v19, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v55, v3, 8 +; SI-NEXT: v_alignbit_b32 v8, v38, v57, 24 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v38, v57, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v38, v57, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v8, v35, v13, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v8, v35, v13, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v8, v30, v10, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v8, v30, v10, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v8, v30, v10, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 
v8, v27, v7, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v8, v27, v7, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v8, v27, v7, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v24, v60, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v37, v6, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v24, v60, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v37, v6, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v21, v57, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v37, v6, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v21, v57, 16 +; SI-NEXT: v_alignbit_b32 v8, v21, v5, 24 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v21, v57, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v18, v51, 24 -; SI-NEXT: 
buffer_store_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v21, v5, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v18, v51, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v8, v18, v51, 8 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v21, v5, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 8, v12 -; SI-NEXT: v_lshrrev_b32_e32 v51, 8, v44 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v61, v4, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v21 +; SI-NEXT: v_alignbit_b32 v8, v61, v4, 16 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v63, v34, 24 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 8, v9 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v61, v4, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v63, v34, 16 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v63, v3, 24 
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v63, v34, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v63, v3, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v12, v52, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v63, v3, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v12, v52, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v12, v56, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v12, v52, 8 +; SI-NEXT: v_alignbit_b32 v8, v12, v56, 16 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v12, v56, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v8, v9, v1, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v8, v9, v1, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v8, v9, v1, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
v_lshrrev_b32_e32 v8, 8, v38 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v30 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v52, v44, v36, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v26, 8, v44 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v27 +; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v47 ; SI-NEXT: .LBB91_5: ; %end -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v36, 0xff, v36 -; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14 +; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v52 +; SI-NEXT: v_or_b32_e32 v36, v36, v52 +; SI-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v26 +; SI-NEXT: v_and_b32_e32 v33, 0xff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v31, 8, v34 +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v28 -; SI-NEXT: v_or_b32_e32 v32, v36, v32 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v36, 0xff, v29 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: 
v_lshlrev_b32_e32 v36, 16, v36 -; SI-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; SI-NEXT: v_and_b32_e32 v52, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v29 -; SI-NEXT: v_or_b32_e32 v36, v56, v36 -; SI-NEXT: v_or_b32_e32 v32, v32, v36 -; SI-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v32, 0xff, v44 -; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v51 -; SI-NEXT: v_or_b32_e32 v32, v32, v36 -; SI-NEXT: v_and_b32_e32 v36, 0xff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; SI-NEXT: v_or_b32_e32 v14, v14, v36 -; SI-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; SI-NEXT: v_or_b32_e32 v14, v32, v14 -; SI-NEXT: v_add_i32_e32 v32, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v14, v32, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v17 +; SI-NEXT: v_or_b32_e32 v51, v51, v52 +; SI-NEXT: v_or_b32_e32 v36, v36, v51 +; SI-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v14, v14, v32 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v36, 0xff, v44 +; SI-NEXT: v_or_b32_e32 v36, v36, v39 +; SI-NEXT: v_and_b32_e32 v39, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_and_b32_e32 v36, 0xffff, v36 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v32, 0xff, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v15 +; SI-NEXT: v_or_b32_e32 v39, v51, v39 +; SI-NEXT: v_or_b32_e32 
v36, v36, v39 +; SI-NEXT: v_add_i32_e32 v39, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v36, v39, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; SI-NEXT: v_or_b32_e32 v17, v33, v17 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v19 -; SI-NEXT: v_or_b32_e32 v32, v33, v32 -; SI-NEXT: v_or_b32_e32 v14, v14, v32 -; SI-NEXT: v_add_i32_e32 v32, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v14, v32, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v58 -; SI-NEXT: v_or_b32_e32 v11, v14, v11 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v33, 0xff, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v36, 24, v26 +; SI-NEXT: v_or_b32_e32 v33, v36, v33 +; SI-NEXT: v_or_b32_e32 v17, v17, v33 +; SI-NEXT: v_add_i32_e32 v33, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v17, v33, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xff, v58 +; SI-NEXT: v_or_b32_e32 v17, v17, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v6 -; SI-NEXT: v_or_b32_e32 v14, v32, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v11, 
v14, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v15 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_or_b32_e32 v17, v17, v31 +; SI-NEXT: v_add_i32_e32 v31, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v17, v31, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v2 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v60 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v19 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v28, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v28, 8, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v17, v28 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xff, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v31, 24, v23 +; SI-NEXT: v_or_b32_e32 v28, v31, v28 +; SI-NEXT: v_or_b32_e32 v17, v17, v28 +; SI-NEXT: v_add_i32_e32 v28, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v17, v28, s[0:3], 
0 offen +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v47 -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v49 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v47 +; SI-NEXT: v_or_b32_e32 v8, v17, v8 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v2 -; SI-NEXT: v_or_b32_e32 v14, v28, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v15 +; SI-NEXT: v_or_b32_e32 v17, v28, v17 +; SI-NEXT: v_or_b32_e32 v8, v8, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v8, v17, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v57 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xff, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; SI-NEXT: v_or_b32_e32 v8, v8, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: 
s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v25 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v25, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v25, v17 +; SI-NEXT: v_or_b32_e32 v8, v8, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v8, v17, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v50 -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v39 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v59 +; SI-NEXT: v_or_b32_e32 v8, v8, v17 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v2 -; SI-NEXT: v_or_b32_e32 v14, v25, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v23 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 
+; SI-NEXT: v_and_b32_e32 v17, 0xff, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v17 -; SI-NEXT: v_or_b32_e32 v14, v22, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v15 +; SI-NEXT: v_or_b32_e32 v17, v25, v17 +; SI-NEXT: v_or_b32_e32 v8, v8, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v8, v17, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v62 -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v31 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v53 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; SI-NEXT: v_or_b32_e32 v8, v8, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v17 -; SI-NEXT: v_or_b32_e32 v14, v22, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; 
SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v22, v17 +; SI-NEXT: v_or_b32_e32 v8, v8, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v8, v17, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v34 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v62 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v19, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v55 -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v46 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v61 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v17 -; SI-NEXT: v_or_b32_e32 v14, v19, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v11, v14, 
s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v43 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v41 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v54 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v17 +; SI-NEXT: v_or_b32_e32 v11, v17, v11 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v38 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xff, v19 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v2 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v17 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 52, v0 -; SI-NEXT: 
buffer_store_dword v11, v14, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v20 -; SI-NEXT: v_or_b32_e32 v11, v11, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v17, v11 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v20 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_or_b32_e32 v11, v11, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v35 -; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v15 -; SI-NEXT: v_or_b32_e32 v11, v11, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v45 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_or_b32_e32 v11, 
v11, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v17 +; SI-NEXT: v_or_b32_e32 v11, v17, v11 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xff, v57 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: v_add_i32_e32 v11, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v11, v16, v11 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xff, v30 -; SI-NEXT: buffer_load_dword 
v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v2 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v60 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v16 +; SI-NEXT: v_or_b32_e32 v11, v16, v11 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v46 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v10, 8, 
v10 -; SI-NEXT: v_or_b32_e32 v7, v7, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v45 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v2 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v2 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_or_b32_e32 v7, v7, v10 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v8, v10, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v30 +; 
SI-NEXT: v_lshlrev_b32_e32 v10, 8, v14 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v2 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v8, v10, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v2 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v7, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v18 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded 
Reload ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v2 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v26 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v2 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v24 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v2 +; 
SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v2 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v2 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v5, v6, 
s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v21 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v2 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v2 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: 
buffer_load_dword v5, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v52 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v2 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_add_i32_e32 v5, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v18 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v61 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v2 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v2 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_add_i32_e32 v5, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; SI-NEXT: 
s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v2 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xff, v63 -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v42 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v2 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; 
SI-NEXT: v_lshlrev_b32_e32 v5, 24, v2 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v56 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -164248,14 +164368,14 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v12 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v40 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v49 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 @@ -164265,12 +164385,12 @@ define inreg <128 x i8> 
@bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -164281,10 +164401,10 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v9 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 @@ -164322,8 +164442,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; VI-NEXT: 
buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: v_writelane_b32 v63, s30, 0 ; VI-NEXT: v_writelane_b32 v63, s31, 1 @@ -165128,26 +165248,26 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 ; VI-NEXT: v_alignbit_b32 v31, v31, v33, 16 ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[29:30] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[27:28] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[27:28] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[25:26] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, 
s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[15:16] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill @@ -165158,126 +165278,129 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[9:10] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[7:8] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 
4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[5:6] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[3:4] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[1:2] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v32 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v32 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded 
Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v31 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v31 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v30 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v30 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v29 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v28 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 
4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v27 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; VI-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v16 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v4 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 -; 
VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v4 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v3 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v2 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v16 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 24, 
v12 -; VI-NEXT: v_lshrrev_b32_e32 v35, 24, v8 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 ; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[23:24] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v45, 24, v16 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v12 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v2 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 ; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v16 ; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v13 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v12 ; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v9 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v7 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v5 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17 -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v49, 24, v18 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v52, 8, 
v18 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v17 +; VI-NEXT: v_lshrrev_b32_e32 v45, 24, v16 ; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 ; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v15 ; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v14 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 ; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v14 ; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v13 ; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill @@ -165290,23 +165413,21 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_lshrrev_b32_e32 v40, 8, v10 ; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v9 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v7 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v6 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v5 -; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v4 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v3 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v7 +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v38, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v61, 
8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v4 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v1 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; VI-NEXT: s_branch .LBB91_5 ; VI-NEXT: .LBB91_3: ; VI-NEXT: ; implicit-def: $sgpr46 @@ -165466,23 +165587,29 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_branch .LBB91_2 ; VI-NEXT: .LBB91_4: ; VI-NEXT: v_mov_b32_e32 v33, s71 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v33, s69 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v33, s70 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v33, s68 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v33, s67 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s86 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s83 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; 
VI-NEXT: v_mov_b32_e32 v33, s66 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s64 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s65 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s54 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s53 ; VI-NEXT: v_mov_b32_e32 v31, s4 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s82 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s86 ; VI-NEXT: v_readlane_b32 s4, v62, 0 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v33, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 1 ; VI-NEXT: v_mov_b32_e32 v40, s4 @@ -165514,171 +165641,170 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_readlane_b32 s4, v62, 13 ; VI-NEXT: v_mov_b32_e32 v46, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 14 -; VI-NEXT: v_mov_b32_e32 v50, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 15 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v33, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 15 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 16 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v33, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 17 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], 
s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v33, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 18 +; VI-NEXT: v_mov_b32_e32 v36, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 19 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v33, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 19 -; VI-NEXT: v_mov_b32_e32 v55, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 20 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_mov_b32_e32 v52, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 21 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_mov_b32_e32 v49, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 22 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v49, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 23 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_mov_b32_e32 v50, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 24 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v50, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 25 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v50, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 26 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 
offset:376 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v50, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 27 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v50, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 28 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v50, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 29 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v50, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 30 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v50, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 31 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v50, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 32 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v50, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 33 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_mov_b32_e32 v41, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 34 -; VI-NEXT: buffer_store_dword 
v55, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v50, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 35 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v41, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 36 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v41, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 37 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v41, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 38 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v41, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 39 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v41, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 40 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v41, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 41 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:284 ; 4-byte Folded 
Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v41, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 42 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v41, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 43 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v41, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 44 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v41, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 45 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v41, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 46 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v41, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 47 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v41, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 48 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: 
buffer_store_dword v41, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v41, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 49 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v41, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 50 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v41, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 51 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v41, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 52 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v41, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 53 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v41, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 54 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v41, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 55 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:236 ; 
4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v41, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 56 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v41, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 57 -; VI-NEXT: v_mov_b32_e32 v42, s54 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v41, s4 +; VI-NEXT: v_mov_b32_e32 v42, s51 +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v41, s46 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v41, s56 -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v41, s58 ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v41, s58 +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v41, s60 ; VI-NEXT: v_mov_b32_e32 v45, 
s72 ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v45, s74 -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v45, s76 -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v45, s78 ; VI-NEXT: v_mov_b32_e32 v55, s88 ; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v36, s66 -; VI-NEXT: v_mov_b32_e32 v52, s64 -; VI-NEXT: v_mov_b32_e32 v55, v50 -; VI-NEXT: v_mov_b32_e32 v35, s30 -; VI-NEXT: v_mov_b32_e32 v59, s87 -; VI-NEXT: v_mov_b32_e32 v58, s34 -; VI-NEXT: v_mov_b32_e32 v45, s36 +; VI-NEXT: v_mov_b32_e32 v61, s50 +; VI-NEXT: v_mov_b32_e32 v58, s83 +; 
VI-NEXT: v_mov_b32_e32 v55, v48 +; VI-NEXT: v_mov_b32_e32 v48, v47 +; VI-NEXT: v_mov_b32_e32 v57, s30 +; VI-NEXT: v_mov_b32_e32 v35, s83 +; VI-NEXT: v_mov_b32_e32 v60, s34 +; VI-NEXT: v_mov_b32_e32 v42, s36 ; VI-NEXT: v_mov_b32_e32 v34, s38 ; VI-NEXT: v_mov_b32_e32 v1, s44 ; VI-NEXT: v_mov_b32_e32 v2, s45 @@ -165711,44 +165837,44 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_mov_b32_e32 v29, s28 ; VI-NEXT: v_mov_b32_e32 v30, s29 ; VI-NEXT: v_mov_b32_e32 v32, s5 +; VI-NEXT: v_mov_b32_e32 v38, s87 +; VI-NEXT: v_mov_b32_e32 v37, s82 ; VI-NEXT: v_mov_b32_e32 v41, s62 -; VI-NEXT: v_mov_b32_e32 v57, s81 -; VI-NEXT: v_mov_b32_e32 v37, s84 -; VI-NEXT: v_mov_b32_e32 v60, s52 -; VI-NEXT: v_mov_b32_e32 v38, s51 -; VI-NEXT: v_mov_b32_e32 v61, s65 -; VI-NEXT: v_mov_b32_e32 v49, s66 -; VI-NEXT: v_mov_b32_e32 v39, s55 +; VI-NEXT: v_mov_b32_e32 v59, s84 +; VI-NEXT: v_mov_b32_e32 v39, s51 ; VI-NEXT: v_mov_b32_e32 v50, v46 -; VI-NEXT: v_mov_b32_e32 v46, v48 -; VI-NEXT: v_mov_b32_e32 v48, v47 ; VI-NEXT: v_mov_b32_e32 v47, v56 ; VI-NEXT: v_mov_b32_e32 v56, v51 ; VI-NEXT: v_mov_b32_e32 v51, s90 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v35, s85 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], 
s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v34, s48 ; VI-NEXT: v_mov_b32_e32 v51, v53 ; VI-NEXT: v_mov_b32_e32 v53, v54 ; VI-NEXT: v_mov_b32_e32 v54, v40 ; VI-NEXT: v_mov_b32_e32 v40, s80 -; VI-NEXT: v_mov_b32_e32 v58, s50 -; VI-NEXT: v_mov_b32_e32 v45, s53 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v57, s81 +; VI-NEXT: v_mov_b32_e32 v58, s85 +; VI-NEXT: v_mov_b32_e32 v60, s50 +; VI-NEXT: v_mov_b32_e32 v61, s52 +; VI-NEXT: v_mov_b32_e32 v42, s55 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; VI-NEXT: .LBB91_5: ; %end -; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v33 -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v36 ; VI-NEXT: v_or_b32_sdwa v17, v17, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v36, 8, 
v52 +; VI-NEXT: v_or_b32_sdwa v18, v18, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_readlane_b32 s87, v63, 31 ; VI-NEXT: v_readlane_b32 s86, v63, 30 ; VI-NEXT: v_readlane_b32 s85, v63, 29 @@ -165781,44 +165907,35 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_readlane_b32 s34, v63, 2 ; VI-NEXT: v_readlane_b32 s31, v63, 1 ; VI-NEXT: v_readlane_b32 s30, v63, 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v33 -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v18, v18, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v33 -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v45 ; VI-NEXT: v_or_b32_sdwa v34, v33, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v17, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v49 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v33, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v18, vcc, 4, v0 ; VI-NEXT: buffer_store_dword v17, v18, 
s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 ; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v18, vcc, 8, v0 ; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -165829,23 +165946,23 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 
x bfloat> inreg %a ; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v18, vcc, 12, v0 ; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 ; VI-NEXT: v_or_b32_sdwa v18, v21, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v18, vcc, 16, v0 ; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:352 ; 
4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -165856,20 +165973,20 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v18, vcc, 20, v0 ; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v41 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 ; VI-NEXT: v_or_b32_sdwa v18, v23, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v18, vcc, 24, v0 ; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -165882,21 +165999,21 @@ define inreg 
<128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 ; VI-NEXT: v_or_b32_sdwa v18, v25, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v18, vcc, 32, v0 ; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -165907,23 +166024,23 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_sdwa v17, v17, v18 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v18, vcc, 36, v0 ; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 ; VI-NEXT: v_or_b32_sdwa v18, v27, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v18, vcc, 40, v0 ; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: 
v_lshlrev_b32_e32 v17, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -165934,23 +166051,23 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v18, vcc, 44, v0 ; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 ; VI-NEXT: v_or_b32_sdwa v18, v29, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v18, vcc, 48, v0 ; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; 
VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -165961,23 +166078,23 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v18, vcc, 52, v0 ; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 ; VI-NEXT: v_or_b32_sdwa v18, v31, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v18, vcc, 56, v0 ; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 
offset:220 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -165988,90 +166105,95 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v18, vcc, 60, v0 ; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 ; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v17, vcc, 64, v0 ; VI-NEXT: buffer_store_dword v1, v17, s[0:3], 0 offen -; VI-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v49 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v61 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v42 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v45 -; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v60 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v59 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v61 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v38 ; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, 
v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v35 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v58 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v37 ; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v57 ; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -166125,22 +166247,24 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v48 ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v43 ; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v50 ; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 ; VI-NEXT: s_waitcnt vmcnt(2) @@ -166166,8 +166290,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; VI-NEXT: 
s_or_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -166230,8 +166354,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: v_readfirstlane_b32 s59, v14 ; GFX9-NEXT: v_readfirstlane_b32 s56, v15 ; GFX9-NEXT: v_readfirstlane_b32 s57, v16 -; GFX9-NEXT: v_readfirstlane_b32 s46, v17 -; GFX9-NEXT: v_readfirstlane_b32 s47, v18 +; GFX9-NEXT: v_readfirstlane_b32 s44, v17 +; GFX9-NEXT: v_readfirstlane_b32 s45, v18 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 ; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec ; GFX9-NEXT: v_readfirstlane_b32 s5, v2 @@ -166346,51 +166470,51 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: v_writelane_b32 v62, s41, 3 ; GFX9-NEXT: s_lshr_b64 s[40:41], s[22:23], 24 ; GFX9-NEXT: v_writelane_b32 v62, s40, 0 -; GFX9-NEXT: s_lshr_b32 s70, s47, 24 -; GFX9-NEXT: s_lshr_b32 s15, s47, 16 -; GFX9-NEXT: s_lshr_b32 s7, s47, 8 -; GFX9-NEXT: s_lshr_b32 s53, s46, 16 -; GFX9-NEXT: s_lshr_b32 s52, s46, 8 -; GFX9-NEXT: s_lshr_b32 s67, s57, 24 +; GFX9-NEXT: s_lshr_b32 s53, s45, 24 +; GFX9-NEXT: s_lshr_b32 s15, s45, 16 +; GFX9-NEXT: s_lshr_b32 s70, s45, 8 +; GFX9-NEXT: s_lshr_b32 s7, s44, 16 +; GFX9-NEXT: s_lshr_b32 s6, s44, 8 +; GFX9-NEXT: s_lshr_b32 s65, s57, 24 ; GFX9-NEXT: s_lshr_b32 s14, s57, 16 -; GFX9-NEXT: s_lshr_b32 s69, s57, 8 -; GFX9-NEXT: s_lshr_b32 s6, s56, 16 -; GFX9-NEXT: s_lshr_b32 s71, s56, 8 -; GFX9-NEXT: s_lshr_b32 s64, s59, 24 +; GFX9-NEXT: s_lshr_b32 s68, s57, 8 +; GFX9-NEXT: s_lshr_b32 s69, s56, 16 +; GFX9-NEXT: s_lshr_b32 s38, s56, 8 +; GFX9-NEXT: 
s_lshr_b32 s54, s59, 24 ; GFX9-NEXT: s_lshr_b32 s13, s59, 16 -; GFX9-NEXT: s_lshr_b32 s66, s59, 8 -; GFX9-NEXT: s_lshr_b32 s51, s58, 16 -; GFX9-NEXT: s_lshr_b32 s68, s58, 8 -; GFX9-NEXT: s_lshr_b32 s99, s61, 24 +; GFX9-NEXT: s_lshr_b32 s52, s59, 8 +; GFX9-NEXT: s_lshr_b32 s67, s58, 16 +; GFX9-NEXT: s_lshr_b32 s66, s58, 8 +; GFX9-NEXT: s_lshr_b32 s97, s61, 24 ; GFX9-NEXT: s_lshr_b32 s12, s61, 16 -; GFX9-NEXT: s_lshr_b32 s55, s61, 8 -; GFX9-NEXT: s_lshr_b32 s50, s60, 16 -; GFX9-NEXT: s_lshr_b32 s65, s60, 8 -; GFX9-NEXT: s_lshr_b32 s96, s63, 24 +; GFX9-NEXT: s_lshr_b32 s51, s61, 8 +; GFX9-NEXT: s_lshr_b32 s64, s60, 16 +; GFX9-NEXT: s_lshr_b32 s55, s60, 8 +; GFX9-NEXT: s_lshr_b32 s86, s63, 24 ; GFX9-NEXT: s_lshr_b32 s11, s63, 16 -; GFX9-NEXT: s_lshr_b32 s98, s63, 8 -; GFX9-NEXT: s_lshr_b32 s49, s62, 16 -; GFX9-NEXT: s_lshr_b32 s54, s62, 8 -; GFX9-NEXT: s_lshr_b32 s85, s73, 24 +; GFX9-NEXT: s_lshr_b32 s50, s63, 8 +; GFX9-NEXT: s_lshr_b32 s99, s62, 16 +; GFX9-NEXT: s_lshr_b32 s98, s62, 8 +; GFX9-NEXT: s_lshr_b32 s83, s73, 24 ; GFX9-NEXT: s_lshr_b32 s10, s73, 16 -; GFX9-NEXT: s_lshr_b32 s87, s73, 8 -; GFX9-NEXT: s_lshr_b32 s48, s72, 16 -; GFX9-NEXT: s_lshr_b32 s97, s72, 8 -; GFX9-NEXT: s_lshr_b32 s82, s75, 24 +; GFX9-NEXT: s_lshr_b32 s49, s73, 8 +; GFX9-NEXT: s_lshr_b32 s96, s72, 16 +; GFX9-NEXT: s_lshr_b32 s87, s72, 8 +; GFX9-NEXT: s_lshr_b32 s80, s75, 24 ; GFX9-NEXT: s_lshr_b32 s9, s75, 16 -; GFX9-NEXT: s_lshr_b32 s84, s75, 8 -; GFX9-NEXT: s_lshr_b32 s39, s74, 16 -; GFX9-NEXT: s_lshr_b32 s86, s74, 8 -; GFX9-NEXT: s_lshr_b32 s80, s77, 24 +; GFX9-NEXT: s_lshr_b32 s48, s75, 8 +; GFX9-NEXT: s_lshr_b32 s85, s74, 16 +; GFX9-NEXT: s_lshr_b32 s84, s74, 8 +; GFX9-NEXT: s_lshr_b32 s71, s77, 24 ; GFX9-NEXT: s_lshr_b32 s8, s77, 16 -; GFX9-NEXT: s_lshr_b32 s81, s77, 8 -; GFX9-NEXT: s_lshr_b32 s38, s76, 16 -; GFX9-NEXT: s_lshr_b32 s83, s76, 8 +; GFX9-NEXT: s_lshr_b32 s39, s77, 8 +; GFX9-NEXT: s_lshr_b32 s82, s76, 16 +; GFX9-NEXT: s_lshr_b32 s81, s76, 8 ; GFX9-NEXT: v_writelane_b32 
v62, s41, 1 ; GFX9-NEXT: s_lshr_b64 s[40:41], s[20:21], 24 ; GFX9-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 -; GFX9-NEXT: s_lshr_b64 s[44:45], s[16:17], 24 -; GFX9-NEXT: s_lshr_b64 s[78:79], s[46:47], 24 +; GFX9-NEXT: s_lshr_b64 s[46:47], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[44:45], 24 ; GFX9-NEXT: s_lshr_b64 s[88:89], s[56:57], 24 ; GFX9-NEXT: s_lshr_b64 s[90:91], s[58:59], 24 ; GFX9-NEXT: s_lshr_b64 s[92:93], s[60:61], 24 @@ -166401,698 +166525,697 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_cbranch_execnz .LBB91_4 ; GFX9-NEXT: .LBB91_2: ; %cmp.true ; GFX9-NEXT: s_and_b32 s6, s77, 0xffff0000 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x40c00000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s6, v5 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; GFX9-NEXT: s_lshl_b32 s6, s77, 16 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX9-NEXT: v_add_f32_e32 v1, s6, v5 +; GFX9-NEXT: v_bfe_u32 v2, v1, 
16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: s_and_b32 s6, s76, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v8, v5, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_lshl_or_b32 v2, v4, 16, v1 +; GFX9-NEXT: v_add_f32_e32 v1, s6, v5 +; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v1 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: s_lshl_b32 s6, s76, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_add_f32_e32 v3, s6, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s6, v5 ; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 -; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX9-NEXT: s_and_b32 s6, s75, 0xffff0000 -; 
GFX9-NEXT: v_lshl_or_b32 v7, v2, 16, v3 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v3, s6, v5 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc ; GFX9-NEXT: s_lshl_b32 s6, s75, 16 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v31 -; GFX9-NEXT: s_and_b32 s6, s74, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v14, v5, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_lshl_b32 s6, s74, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_add_f32_e32 v3, s6, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v3, s6, v5 ; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 -; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v33 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v32 +; GFX9-NEXT: s_and_b32 s6, s74, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v4, v7, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v3, s6, v5 +; GFX9-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v3 +; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: s_lshl_b32 s6, s74, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc +; GFX9-NEXT: v_add_f32_e32 v6, s6, v5 +; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v7, v7, v6 +; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7 +; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v34 ; GFX9-NEXT: s_and_b32 s6, s73, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v13, v2, 16, v3 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v6 +; GFX9-NEXT: v_add_f32_e32 v6, s6, v5 +; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v7, v7, v6 +; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7 +; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc ; GFX9-NEXT: s_lshl_b32 s6, s73, 16 -; 
GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX9-NEXT: v_add_f32_e32 v6, s6, v5 +; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v7, v7, v6 +; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7 +; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v35 ; GFX9-NEXT: s_and_b32 s6, s72, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v16, v32, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_lshl_or_b32 v10, v33, 16, v6 +; GFX9-NEXT: v_add_f32_e32 v6, s6, v5 +; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v7, v7, v6 +; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7 +; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: s_lshl_b32 s6, s72, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_add_f32_e32 v3, s6, v1 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v36 +; GFX9-NEXT: 
v_cndmask_b32_e32 v6, v7, v8, vcc +; GFX9-NEXT: v_add_f32_e32 v7, s6, v5 +; GFX9-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v8, v8, v7 +; GFX9-NEXT: v_add_u32_e32 v8, 0x7fff, v8 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v37 ; GFX9-NEXT: s_and_b32 s6, s63, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v15, v2, 16, v3 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshl_or_b32 v9, v6, 16, v7 +; GFX9-NEXT: v_add_f32_e32 v6, s6, v5 +; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v7, v7, v6 +; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7 +; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc ; GFX9-NEXT: s_lshl_b32 s6, s63, 16 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v6 +; GFX9-NEXT: v_add_f32_e32 v6, s6, v5 +; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v7, v7, v6 +; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7 +; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; GFX9-NEXT: 
v_lshrrev_b32_e32 v38, 16, v6 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v38 ; GFX9-NEXT: s_and_b32 s6, s62, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v18, v35, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_lshl_or_b32 v16, v36, 16, v6 +; GFX9-NEXT: v_add_f32_e32 v6, s6, v5 +; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v7, v7, v6 +; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7 +; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: s_lshl_b32 s6, s62, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_add_f32_e32 v3, s6, v1 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v39 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; GFX9-NEXT: v_add_f32_e32 v7, s6, v5 +; GFX9-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v8, v8, v7 +; GFX9-NEXT: v_add_u32_e32 v8, 0x7fff, v8 +; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v8, v11, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v48 ; GFX9-NEXT: s_and_b32 s6, s61, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v17, v2, 16, v3 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 
vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshl_or_b32 v15, v6, 16, v7 +; GFX9-NEXT: v_add_f32_e32 v6, s6, v5 +; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v7, v7, v6 +; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7 +; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc ; GFX9-NEXT: s_lshl_b32 s6, s61, 16 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v6 +; GFX9-NEXT: v_add_f32_e32 v6, s6, v5 +; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v7, v7, v6 +; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7 +; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v49 ; GFX9-NEXT: s_and_b32 s6, s60, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v20, v38, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_lshl_or_b32 v21, v39, 16, v6 +; GFX9-NEXT: v_add_f32_e32 v6, s6, v5 +; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v7, v7, v6 +; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7 +; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: s_lshl_b32 s6, s60, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; 
GFX9-NEXT: v_add_f32_e32 v3, s6, v1 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v50 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; GFX9-NEXT: v_add_f32_e32 v7, s6, v5 +; GFX9-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v8, v8, v7 +; GFX9-NEXT: v_add_u32_e32 v8, 0x7fff, v8 +; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v8, v11, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v51 ; GFX9-NEXT: s_and_b32 s6, s59, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v19, v2, 16, v3 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshl_or_b32 v20, v6, 16, v7 +; GFX9-NEXT: v_add_f32_e32 v6, s6, v5 +; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v7, v7, v6 +; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7 +; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc ; GFX9-NEXT: s_lshl_b32 s6, s59, 16 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: 
v_lshrrev_b32_e32 v51, 16, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; GFX9-NEXT: v_add_f32_e32 v6, s6, v5 +; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v7, v7, v6 +; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7 +; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v6 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v52 ; GFX9-NEXT: s_and_b32 s6, s58, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v22, v49, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_lshl_or_b32 v23, v50, 16, v6 +; GFX9-NEXT: v_add_f32_e32 v6, s6, v5 +; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v7, v7, v6 +; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7 +; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: s_lshl_b32 s6, s58, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_add_f32_e32 v3, s6, v1 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v53 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; GFX9-NEXT: v_add_f32_e32 v7, s6, v5 +; GFX9-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v8, v8, v7 +; GFX9-NEXT: v_add_u32_e32 v8, 0x7fff, v8 +; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v8, v11, vcc +; GFX9-NEXT: 
v_lshrrev_b32_e32 v54, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v54 ; GFX9-NEXT: s_and_b32 s6, s57, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v21, v2, 16, v3 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshl_or_b32 v22, v6, 16, v7 +; GFX9-NEXT: v_add_f32_e32 v6, s6, v5 +; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v7, v7, v6 +; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7 +; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc ; GFX9-NEXT: s_lshl_b32 s6, s57, 16 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v54 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v6 +; GFX9-NEXT: v_add_f32_e32 v6, s6, v5 +; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v7, v7, v6 +; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7 +; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v6 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v55 ; GFX9-NEXT: s_and_b32 s6, s56, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v24, v52, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 
0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_lshl_or_b32 v25, v53, 16, v6 +; GFX9-NEXT: v_add_f32_e32 v6, s6, v5 +; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v7, v7, v6 +; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7 +; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: s_lshl_b32 s6, s56, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_add_f32_e32 v3, s6, v1 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v40 -; GFX9-NEXT: s_and_b32 s6, s47, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v23, v2, 16, v3 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: s_lshl_b32 s6, s47, 16 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v41 -; GFX9-NEXT: s_and_b32 s6, s46, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v26, v55, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; 
GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_lshl_b32 s6, s46, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_add_f32_e32 v3, s6, v1 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v42 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; GFX9-NEXT: v_add_f32_e32 v7, s6, v5 +; GFX9-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v8, v8, v7 +; GFX9-NEXT: v_add_u32_e32 v8, 0x7fff, v8 +; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v8, v11, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v41 +; GFX9-NEXT: s_and_b32 s6, s45, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v24, v6, 16, v7 +; GFX9-NEXT: v_add_f32_e32 v6, s6, v5 +; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v7, v7, v6 +; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7 +; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; GFX9-NEXT: s_lshl_b32 s6, s45, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v6 +; GFX9-NEXT: v_add_f32_e32 v6, s6, v5 +; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v7, v7, v6 +; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7 +; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v6 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v42 +; GFX9-NEXT: s_and_b32 s6, s44, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v27, v40, 16, v6 +; GFX9-NEXT: v_add_f32_e32 v6, s6, v5 +; 
GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v7, v7, v6 +; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7 +; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: s_lshl_b32 s6, s44, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; GFX9-NEXT: v_add_f32_e32 v7, s6, v5 +; GFX9-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v8, v8, v7 +; GFX9-NEXT: v_add_u32_e32 v8, 0x7fff, v8 +; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v8, v11, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v43 ; GFX9-NEXT: s_and_b32 s6, s17, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v25, v2, 16, v3 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: v_lshl_or_b32 v26, v6, 16, v7 +; GFX9-NEXT: v_add_f32_e32 v6, s6, v5 +; GFX9-NEXT: v_readfirstlane_b32 s6, v6 ; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 ; GFX9-NEXT: s_add_i32 s7, s7, s6 ; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff ; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec ; GFX9-NEXT: s_cselect_b32 s6, s9, s8 ; GFX9-NEXT: s_lshr_b32 s11, s6, 16 ; GFX9-NEXT: s_lshl_b32 s6, s17, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: v_add_f32_e32 v6, s6, v5 +; GFX9-NEXT: v_readfirstlane_b32 s6, v6 ; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 ; GFX9-NEXT: s_add_i32 s7, s7, s6 ; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff ; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec ; GFX9-NEXT: s_cselect_b32 s6, s9, s8 ; GFX9-NEXT: s_lshr_b32 s17, s6, 16 ; GFX9-NEXT: s_and_b32 s6, s16, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: 
v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: v_add_f32_e32 v6, s6, v5 +; GFX9-NEXT: v_readfirstlane_b32 s6, v6 ; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 ; GFX9-NEXT: s_add_i32 s7, s7, s6 ; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff ; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec ; GFX9-NEXT: s_cselect_b32 s6, s9, s8 ; GFX9-NEXT: s_lshr_b32 s8, s6, 16 ; GFX9-NEXT: s_lshl_b32 s6, s16, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: v_add_f32_e32 v6, s6, v5 +; GFX9-NEXT: v_readfirstlane_b32 s6, v6 ; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 ; GFX9-NEXT: s_add_i32 s7, s7, s6 ; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff ; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec ; GFX9-NEXT: s_cselect_b32 s6, s10, s9 ; GFX9-NEXT: s_lshr_b32 s16, s6, 16 ; GFX9-NEXT: s_and_b32 s6, s19, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: v_add_f32_e32 v6, s6, v5 +; GFX9-NEXT: v_readfirstlane_b32 s6, v6 ; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 ; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s46, s16, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s44, s16, s8 ; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff ; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec ; GFX9-NEXT: s_cselect_b32 s6, s9, s8 ; GFX9-NEXT: s_lshr_b32 s12, s6, 16 ; GFX9-NEXT: s_lshl_b32 s6, s19, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: v_add_f32_e32 v6, s6, v5 +; GFX9-NEXT: v_readfirstlane_b32 s6, v6 ; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 ; GFX9-NEXT: s_add_i32 s7, s7, s6 ; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff ; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 -; 
GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec ; GFX9-NEXT: s_cselect_b32 s6, s9, s8 ; GFX9-NEXT: s_lshr_b32 s19, s6, 16 ; GFX9-NEXT: s_and_b32 s6, s18, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: v_add_f32_e32 v6, s6, v5 +; GFX9-NEXT: v_readfirstlane_b32 s6, v6 ; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 ; GFX9-NEXT: s_add_i32 s7, s7, s6 ; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff ; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec ; GFX9-NEXT: s_cselect_b32 s6, s9, s8 ; GFX9-NEXT: s_lshr_b32 s8, s6, 16 ; GFX9-NEXT: s_lshl_b32 s6, s18, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: v_add_f32_e32 v6, s6, v5 +; GFX9-NEXT: v_readfirstlane_b32 s6, v6 ; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 ; GFX9-NEXT: s_add_i32 s7, s7, s6 ; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff ; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec ; GFX9-NEXT: s_cselect_b32 s6, s10, s9 ; GFX9-NEXT: s_lshr_b32 s18, s6, 16 ; GFX9-NEXT: s_and_b32 s6, s21, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: v_add_f32_e32 v6, s6, v5 +; GFX9-NEXT: v_readfirstlane_b32 s6, v6 ; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 ; GFX9-NEXT: s_add_i32 s7, s7, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s56, s18, s8 ; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff ; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec ; GFX9-NEXT: s_cselect_b32 s6, s9, s8 ; GFX9-NEXT: s_lshr_b32 s13, s6, 16 ; GFX9-NEXT: s_lshl_b32 s6, s21, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 
s6, v2 +; GFX9-NEXT: v_add_f32_e32 v6, s6, v5 +; GFX9-NEXT: v_readfirstlane_b32 s6, v6 ; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 ; GFX9-NEXT: s_add_i32 s7, s7, s6 ; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff ; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec ; GFX9-NEXT: s_cselect_b32 s6, s9, s8 ; GFX9-NEXT: s_lshr_b32 s21, s6, 16 ; GFX9-NEXT: s_and_b32 s6, s20, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: v_add_f32_e32 v6, s6, v5 +; GFX9-NEXT: v_readfirstlane_b32 s6, v6 ; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 ; GFX9-NEXT: s_add_i32 s7, s7, s6 ; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff ; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec ; GFX9-NEXT: s_cselect_b32 s6, s9, s8 ; GFX9-NEXT: s_lshr_b32 s8, s6, 16 ; GFX9-NEXT: s_lshl_b32 s6, s20, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: v_add_f32_e32 v6, s6, v5 +; GFX9-NEXT: v_readfirstlane_b32 s6, v6 ; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 ; GFX9-NEXT: s_add_i32 s7, s7, s6 ; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff ; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec ; GFX9-NEXT: s_cselect_b32 s6, s10, s9 ; GFX9-NEXT: s_lshr_b32 s20, s6, 16 ; GFX9-NEXT: s_and_b32 s6, s23, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: v_add_f32_e32 v6, s6, v5 +; GFX9-NEXT: v_readfirstlane_b32 s6, v6 ; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 ; GFX9-NEXT: s_add_i32 s7, s7, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s58, s20, s8 ; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff ; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: 
v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec ; GFX9-NEXT: s_cselect_b32 s6, s9, s8 ; GFX9-NEXT: s_lshr_b32 s14, s6, 16 ; GFX9-NEXT: s_lshl_b32 s6, s23, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: v_add_f32_e32 v6, s6, v5 +; GFX9-NEXT: v_readfirstlane_b32 s6, v6 ; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 ; GFX9-NEXT: s_add_i32 s7, s7, s6 ; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff ; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec ; GFX9-NEXT: s_cselect_b32 s6, s9, s8 ; GFX9-NEXT: s_lshr_b32 s23, s6, 16 ; GFX9-NEXT: s_and_b32 s6, s22, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: v_add_f32_e32 v6, s6, v5 +; GFX9-NEXT: v_readfirstlane_b32 s6, v6 ; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 ; GFX9-NEXT: s_add_i32 s7, s7, s6 ; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff ; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec ; GFX9-NEXT: s_cselect_b32 s6, s9, s8 ; GFX9-NEXT: s_lshr_b32 s8, s6, 16 ; GFX9-NEXT: s_lshl_b32 s6, s22, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: v_add_f32_e32 v6, s6, v5 +; GFX9-NEXT: v_readfirstlane_b32 s6, v6 ; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 ; GFX9-NEXT: s_add_i32 s7, s7, s6 ; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff ; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec ; GFX9-NEXT: s_cselect_b32 s6, s10, s9 ; GFX9-NEXT: s_lshr_b32 s22, s6, 16 ; GFX9-NEXT: s_and_b32 s6, s25, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: v_add_f32_e32 v6, s6, v5 +; GFX9-NEXT: v_readfirstlane_b32 s6, v6 ; 
GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 ; GFX9-NEXT: s_add_i32 s7, s7, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s60, s22, s8 ; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff ; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec ; GFX9-NEXT: s_cselect_b32 s6, s9, s8 ; GFX9-NEXT: s_lshr_b32 s15, s6, 16 ; GFX9-NEXT: s_lshl_b32 s6, s25, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: v_add_f32_e32 v6, s6, v5 +; GFX9-NEXT: v_readfirstlane_b32 s6, v6 ; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 ; GFX9-NEXT: s_add_i32 s7, s7, s6 ; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff ; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec ; GFX9-NEXT: s_cselect_b32 s6, s9, s8 ; GFX9-NEXT: s_lshr_b32 s25, s6, 16 ; GFX9-NEXT: s_and_b32 s6, s24, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: v_add_f32_e32 v6, s6, v5 +; GFX9-NEXT: v_readfirstlane_b32 s6, v6 ; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 ; GFX9-NEXT: s_add_i32 s7, s7, s6 ; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff ; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec ; GFX9-NEXT: s_cselect_b32 s6, s9, s8 ; GFX9-NEXT: s_lshr_b32 s8, s6, 16 ; GFX9-NEXT: s_lshl_b32 s6, s24, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: v_add_f32_e32 v6, s6, v5 +; GFX9-NEXT: v_readfirstlane_b32 s6, v6 ; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 ; GFX9-NEXT: s_add_i32 s7, s7, s6 ; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff ; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec ; GFX9-NEXT: s_cselect_b32 s6, s10, s9 ; 
GFX9-NEXT: s_lshr_b32 s24, s6, 16 ; GFX9-NEXT: s_and_b32 s6, s27, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: v_add_f32_e32 v6, s6, v5 +; GFX9-NEXT: v_readfirstlane_b32 s6, v6 ; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 ; GFX9-NEXT: s_add_i32 s7, s7, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s62, s24, s8 ; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff ; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec ; GFX9-NEXT: s_cselect_b32 s6, s9, s8 ; GFX9-NEXT: s_lshr_b32 s76, s6, 16 ; GFX9-NEXT: s_lshl_b32 s6, s27, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: v_add_f32_e32 v6, s6, v5 +; GFX9-NEXT: v_readfirstlane_b32 s6, v6 ; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 ; GFX9-NEXT: s_add_i32 s7, s7, s6 ; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff ; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec ; GFX9-NEXT: s_cselect_b32 s6, s9, s8 ; GFX9-NEXT: s_lshr_b32 s27, s6, 16 ; GFX9-NEXT: s_and_b32 s6, s26, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: v_add_f32_e32 v6, s6, v5 +; GFX9-NEXT: v_readfirstlane_b32 s6, v6 ; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 ; GFX9-NEXT: s_add_i32 s7, s7, s6 ; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff ; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec ; GFX9-NEXT: s_cselect_b32 s6, s9, s8 ; GFX9-NEXT: s_lshr_b32 s8, s6, 16 ; GFX9-NEXT: s_lshl_b32 s6, s26, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: v_add_f32_e32 v6, s6, v5 +; GFX9-NEXT: v_readfirstlane_b32 s6, v6 ; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 ; GFX9-NEXT: s_add_i32 s7, s7, s6 ; 
GFX9-NEXT: s_add_i32 s9, s7, 0x7fff ; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec ; GFX9-NEXT: s_cselect_b32 s6, s10, s9 ; GFX9-NEXT: s_lshr_b32 s26, s6, 16 ; GFX9-NEXT: s_and_b32 s6, s29, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: v_add_f32_e32 v6, s6, v5 +; GFX9-NEXT: v_readfirstlane_b32 s6, v6 ; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 ; GFX9-NEXT: s_add_i32 s7, s7, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s72, s26, s8 ; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff ; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec ; GFX9-NEXT: s_cselect_b32 s6, s9, s8 ; GFX9-NEXT: s_lshr_b32 s77, s6, 16 ; GFX9-NEXT: s_lshl_b32 s6, s29, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: v_add_f32_e32 v6, s6, v5 +; GFX9-NEXT: v_readfirstlane_b32 s6, v6 ; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 ; GFX9-NEXT: s_add_i32 s7, s7, s6 ; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff ; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec ; GFX9-NEXT: s_cselect_b32 s6, s9, s8 ; GFX9-NEXT: s_lshr_b32 s29, s6, 16 ; GFX9-NEXT: s_and_b32 s6, s28, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: v_add_f32_e32 v6, s6, v5 +; GFX9-NEXT: v_readfirstlane_b32 s6, v6 ; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 ; GFX9-NEXT: s_add_i32 s7, s7, s6 ; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff ; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec ; GFX9-NEXT: s_cselect_b32 s6, s9, s8 ; GFX9-NEXT: s_lshr_b32 s8, s6, 16 ; GFX9-NEXT: s_lshl_b32 s6, 
s28, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: v_add_f32_e32 v6, s6, v5 +; GFX9-NEXT: v_readfirstlane_b32 s6, v6 ; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 ; GFX9-NEXT: s_add_i32 s7, s7, s6 ; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff ; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec ; GFX9-NEXT: s_cselect_b32 s6, s10, s9 ; GFX9-NEXT: s_lshr_b32 s28, s6, 16 ; GFX9-NEXT: s_and_b32 s6, s5, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: v_add_f32_e32 v6, s6, v5 +; GFX9-NEXT: v_readfirstlane_b32 s6, v6 ; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 ; GFX9-NEXT: s_add_i32 s7, s7, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s74, s28, s8 ; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff ; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec ; GFX9-NEXT: s_cselect_b32 s6, s9, s8 ; GFX9-NEXT: s_lshl_b32 s5, s5, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s5, v1 -; GFX9-NEXT: v_readfirstlane_b32 s5, v2 +; GFX9-NEXT: v_add_f32_e32 v6, s5, v5 +; GFX9-NEXT: v_readfirstlane_b32 s5, v6 ; GFX9-NEXT: s_lshr_b32 s78, s6, 16 ; GFX9-NEXT: s_bfe_u32 s6, s5, 0x10010 ; GFX9-NEXT: s_add_i32 s6, s6, s5 ; GFX9-NEXT: s_add_i32 s8, s6, 0x7fff ; GFX9-NEXT: s_bitset1_b32 s5, 22 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec ; GFX9-NEXT: s_cselect_b32 s5, s5, s8 ; GFX9-NEXT: s_and_b32 s6, s4, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: v_add_f32_e32 v6, s6, v5 +; GFX9-NEXT: v_readfirstlane_b32 s6, v6 ; GFX9-NEXT: s_bfe_u32 s8, s6, 0x10010 ; GFX9-NEXT: s_add_i32 s8, s8, s6 ; GFX9-NEXT: s_lshr_b32 s5, s5, 16 ; GFX9-NEXT: s_add_i32 s10, s8, 0x7fff ; GFX9-NEXT: s_bitset1_b32 s6, 22 
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec ; GFX9-NEXT: s_cselect_b32 s6, s6, s10 ; GFX9-NEXT: s_lshl_b32 s4, s4, 16 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v1 -; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: v_add_f32_e32 v5, s4, v5 +; GFX9-NEXT: v_readfirstlane_b32 s4, v5 ; GFX9-NEXT: s_bfe_u32 s8, s4, 0x10010 ; GFX9-NEXT: s_add_i32 s8, s8, s4 ; GFX9-NEXT: s_lshr_b32 s6, s6, 16 ; GFX9-NEXT: s_add_i32 s10, s8, 0x7fff ; GFX9-NEXT: s_bitset1_b32 s4, 22 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_lshrrev_b64 v[1:2], 24, v[25:26] +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec -; GFX9-NEXT: v_lshrrev_b64 v[2:3], 24, v[23:24] ; GFX9-NEXT: s_cselect_b32 s4, s4, s10 -; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[21:22] -; GFX9-NEXT: v_lshrrev_b64 v[9:10], 24, v[15:16] -; GFX9-NEXT: s_pack_ll_b32_b16 s47, s17, s11 +; GFX9-NEXT: v_lshrrev_b64 v[5:6], 24, v[26:27] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[20:21] +; GFX9-NEXT: s_pack_ll_b32_b16 s45, s17, s11 ; GFX9-NEXT: s_pack_ll_b32_b16 s57, s19, s12 ; GFX9-NEXT: s_pack_ll_b32_b16 s59, s21, s13 ; GFX9-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NEXT: v_lshrrev_b64 v[4:5], 24, v[19:20] -; GFX9-NEXT: v_lshrrev_b64 v[10:11], 24, v[13:14] +; GFX9-NEXT: v_lshrrev_b64 v[6:7], 24, v[24:25] +; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[15:16] +; GFX9-NEXT: v_lshrrev_b64 v[17:18], 24, v[3:4] ; GFX9-NEXT: s_pack_ll_b32_b16 s61, s23, s14 ; GFX9-NEXT: s_pack_ll_b32_b16 s63, s25, s15 ; GFX9-NEXT: s_pack_ll_b32_b16 s73, s27, s76 @@ -167101,9 +167224,10 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s4, s6 ; GFX9-NEXT: s_lshr_b64 s[40:41], s[58:59], 24 ; GFX9-NEXT: s_lshr_b64 s[42:43], s[56:57], 24 -; GFX9-NEXT: s_lshr_b64 s[44:45], s[46:47], 24 -; GFX9-NEXT: v_lshrrev_b64 v[5:6], 24, v[17:18] -; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[7:8] +; 
GFX9-NEXT: s_lshr_b64 s[46:47], s[44:45], 24 +; GFX9-NEXT: v_lshrrev_b64 v[7:8], 24, v[22:23] +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b64 v[18:19], 24, v[1:2] ; GFX9-NEXT: s_lshr_b64 s[34:35], s[6:7], 24 ; GFX9-NEXT: s_lshr_b64 s[36:37], s[74:75], 24 ; GFX9-NEXT: s_lshr_b64 s[38:39], s[72:73], 24 @@ -167113,7 +167237,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_lshr_b32 s10, s7, 8 ; GFX9-NEXT: s_lshr_b32 s41, s6, 16 ; GFX9-NEXT: s_lshr_b32 s43, s6, 8 -; GFX9-NEXT: s_lshr_b32 s45, s75, 24 +; GFX9-NEXT: s_lshr_b32 s47, s75, 24 ; GFX9-NEXT: s_lshr_b32 s75, s75, 8 ; GFX9-NEXT: s_lshr_b32 s79, s74, 16 ; GFX9-NEXT: s_lshr_b32 s74, s74, 8 @@ -167137,42 +167261,42 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_lshr_b32 s57, s57, 8 ; GFX9-NEXT: s_lshr_b32 vcc_hi, s56, 16 ; GFX9-NEXT: s_lshr_b32 s56, s56, 8 -; GFX9-NEXT: s_lshr_b32 s30, s47, 24 -; GFX9-NEXT: s_lshr_b32 s47, s47, 8 -; GFX9-NEXT: s_lshr_b32 s8, s46, 16 -; GFX9-NEXT: s_lshr_b32 s7, s46, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 8, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v25 +; GFX9-NEXT: s_lshr_b32 s30, s45, 24 +; GFX9-NEXT: s_lshr_b32 s45, s45, 8 +; GFX9-NEXT: s_lshr_b32 s8, s44, 16 +; GFX9-NEXT: s_lshr_b32 s7, s44, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 8, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 8, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 24, v25 ; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v24 ; GFX9-NEXT: v_lshrrev_b32_e32 v24, 8, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 24, v23 ; GFX9-NEXT: v_lshrrev_b32_e32 v23, 8, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 24, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, 
v22 ; GFX9-NEXT: v_lshrrev_b32_e32 v22, 8, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 24, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 24, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v20 ; GFX9-NEXT: v_lshrrev_b32_e32 v20, 8, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 24, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 8, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 24, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 24, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v15 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 24, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 8, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 24, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v1 ; GFX9-NEXT: s_branch .LBB91_5 ; GFX9-NEXT: .LBB91_3: ; GFX9-NEXT: ; implicit-def: $sgpr6 @@ -167187,46 +167311,46 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: ; implicit-def: $sgpr6 ; GFX9-NEXT: ; kill: killed $sgpr6 ; GFX9-NEXT: ; 
implicit-def: $sgpr78 -; GFX9-NEXT: ; implicit-def: $sgpr83 -; GFX9-NEXT: ; implicit-def: $sgpr38 ; GFX9-NEXT: ; implicit-def: $sgpr81 -; GFX9-NEXT: ; implicit-def: $sgpr8 -; GFX9-NEXT: ; implicit-def: $sgpr80 -; GFX9-NEXT: ; implicit-def: $sgpr86 +; GFX9-NEXT: ; implicit-def: $sgpr82 ; GFX9-NEXT: ; implicit-def: $sgpr39 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr71 ; GFX9-NEXT: ; implicit-def: $sgpr84 -; GFX9-NEXT: ; implicit-def: $sgpr9 -; GFX9-NEXT: ; implicit-def: $sgpr82 -; GFX9-NEXT: ; implicit-def: $sgpr97 +; GFX9-NEXT: ; implicit-def: $sgpr85 ; GFX9-NEXT: ; implicit-def: $sgpr48 +; GFX9-NEXT: ; implicit-def: $sgpr9 +; GFX9-NEXT: ; implicit-def: $sgpr80 ; GFX9-NEXT: ; implicit-def: $sgpr87 -; GFX9-NEXT: ; implicit-def: $sgpr10 -; GFX9-NEXT: ; implicit-def: $sgpr85 -; GFX9-NEXT: ; implicit-def: $sgpr54 +; GFX9-NEXT: ; implicit-def: $sgpr96 ; GFX9-NEXT: ; implicit-def: $sgpr49 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr83 ; GFX9-NEXT: ; implicit-def: $sgpr98 -; GFX9-NEXT: ; implicit-def: $sgpr11 -; GFX9-NEXT: ; implicit-def: $sgpr96 -; GFX9-NEXT: ; implicit-def: $sgpr65 +; GFX9-NEXT: ; implicit-def: $sgpr99 ; GFX9-NEXT: ; implicit-def: $sgpr50 +; GFX9-NEXT: ; implicit-def: $sgpr11 +; GFX9-NEXT: ; implicit-def: $sgpr86 ; GFX9-NEXT: ; implicit-def: $sgpr55 -; GFX9-NEXT: ; implicit-def: $sgpr12 -; GFX9-NEXT: ; implicit-def: $sgpr99 -; GFX9-NEXT: ; implicit-def: $sgpr68 +; GFX9-NEXT: ; implicit-def: $sgpr64 ; GFX9-NEXT: ; implicit-def: $sgpr51 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr97 ; GFX9-NEXT: ; implicit-def: $sgpr66 +; GFX9-NEXT: ; implicit-def: $sgpr67 +; GFX9-NEXT: ; implicit-def: $sgpr52 ; GFX9-NEXT: ; implicit-def: $sgpr13 -; GFX9-NEXT: ; implicit-def: $sgpr64 -; GFX9-NEXT: ; implicit-def: $sgpr71 +; GFX9-NEXT: ; implicit-def: $sgpr54 +; GFX9-NEXT: ; implicit-def: $sgpr38 ; GFX9-NEXT: ; implicit-def: $sgpr69 +; GFX9-NEXT: ; implicit-def: $sgpr68 ; GFX9-NEXT: ; 
implicit-def: $sgpr14 -; GFX9-NEXT: ; implicit-def: $sgpr67 -; GFX9-NEXT: ; implicit-def: $sgpr52 -; GFX9-NEXT: ; implicit-def: $sgpr53 +; GFX9-NEXT: ; implicit-def: $sgpr65 ; GFX9-NEXT: ; implicit-def: $sgpr7 -; GFX9-NEXT: ; implicit-def: $sgpr15 ; GFX9-NEXT: ; implicit-def: $sgpr70 -; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr53 +; GFX9-NEXT: ; implicit-def: $sgpr46 ; GFX9-NEXT: ; implicit-def: $sgpr42 ; GFX9-NEXT: ; implicit-def: $sgpr40 ; GFX9-NEXT: ; implicit-def: $sgpr36 @@ -167330,72 +167454,72 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: v_mov_b32_e32 v1, s77 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-NEXT: v_mov_b32_e32 v46, s51 -; GFX9-NEXT: v_mov_b32_e32 v56, s50 -; GFX9-NEXT: v_mov_b32_e32 v58, s49 -; GFX9-NEXT: v_mov_b32_e32 v60, s48 -; GFX9-NEXT: v_mov_b32_e32 v27, s39 -; GFX9-NEXT: v_mov_b32_e32 v29, s38 -; GFX9-NEXT: v_mov_b32_e32 v10, s34 -; GFX9-NEXT: v_mov_b32_e32 v11, s36 +; GFX9-NEXT: v_mov_b32_e32 v24, s38 +; GFX9-NEXT: v_mov_b32_e32 v21, s51 +; GFX9-NEXT: v_mov_b32_e32 v16, s50 +; GFX9-NEXT: v_mov_b32_e32 v10, s49 +; GFX9-NEXT: v_mov_b32_e32 v4, s48 +; GFX9-NEXT: v_mov_b32_e32 v2, s39 +; GFX9-NEXT: v_mov_b32_e32 v17, s34 +; GFX9-NEXT: v_mov_b32_e32 v18, s36 ; GFX9-NEXT: v_readlane_b32 s34, v62, 8 ; GFX9-NEXT: v_readlane_b32 s36, v62, 6 ; GFX9-NEXT: v_readlane_b32 s38, v62, 4 ; GFX9-NEXT: v_readlane_b32 s48, v62, 2 ; GFX9-NEXT: v_readlane_b32 s50, v62, 0 -; GFX9-NEXT: v_mov_b32_e32 v42, s46 -; GFX9-NEXT: v_mov_b32_e32 v41, s47 -; GFX9-NEXT: v_mov_b32_e32 v55, s15 -; GFX9-NEXT: v_mov_b32_e32 v40, s56 -; GFX9-NEXT: v_mov_b32_e32 v54, s57 -; GFX9-NEXT: v_mov_b32_e32 v52, s14 -; GFX9-NEXT: v_mov_b32_e32 v53, s58 -; GFX9-NEXT: v_mov_b32_e32 v51, s59 -; GFX9-NEXT: v_mov_b32_e32 v49, s13 -; GFX9-NEXT: v_mov_b32_e32 v50, s60 -; GFX9-NEXT: v_mov_b32_e32 
v48, s61 -; GFX9-NEXT: v_mov_b32_e32 v38, s12 -; GFX9-NEXT: v_mov_b32_e32 v39, s62 -; GFX9-NEXT: v_mov_b32_e32 v37, s63 -; GFX9-NEXT: v_mov_b32_e32 v35, s11 -; GFX9-NEXT: v_mov_b32_e32 v36, s72 -; GFX9-NEXT: v_mov_b32_e32 v34, s73 -; GFX9-NEXT: v_mov_b32_e32 v32, s10 -; GFX9-NEXT: v_mov_b32_e32 v33, s74 -; GFX9-NEXT: v_mov_b32_e32 v31, s75 +; GFX9-NEXT: v_mov_b32_e32 v43, s44 +; GFX9-NEXT: v_mov_b32_e32 v42, s45 +; GFX9-NEXT: v_mov_b32_e32 v40, s15 +; GFX9-NEXT: v_mov_b32_e32 v41, s56 +; GFX9-NEXT: v_mov_b32_e32 v55, s57 +; GFX9-NEXT: v_mov_b32_e32 v53, s14 +; GFX9-NEXT: v_mov_b32_e32 v54, s58 +; GFX9-NEXT: v_mov_b32_e32 v52, s59 +; GFX9-NEXT: v_mov_b32_e32 v50, s13 +; GFX9-NEXT: v_mov_b32_e32 v51, s60 +; GFX9-NEXT: v_mov_b32_e32 v49, s61 +; GFX9-NEXT: v_mov_b32_e32 v39, s12 +; GFX9-NEXT: v_mov_b32_e32 v48, s62 +; GFX9-NEXT: v_mov_b32_e32 v38, s63 +; GFX9-NEXT: v_mov_b32_e32 v36, s11 +; GFX9-NEXT: v_mov_b32_e32 v37, s72 +; GFX9-NEXT: v_mov_b32_e32 v35, s73 +; GFX9-NEXT: v_mov_b32_e32 v33, s10 +; GFX9-NEXT: v_mov_b32_e32 v34, s74 +; GFX9-NEXT: v_mov_b32_e32 v32, s75 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v26, s53 -; GFX9-NEXT: v_mov_b32_e32 v25, s52 -; GFX9-NEXT: v_mov_b32_e32 v6, s70 -; GFX9-NEXT: v_mov_b32_e32 v12, s7 -; GFX9-NEXT: v_mov_b32_e32 v44, s6 -; GFX9-NEXT: v_mov_b32_e32 v23, s71 -; GFX9-NEXT: v_mov_b32_e32 v43, s67 -; GFX9-NEXT: v_mov_b32_e32 v24, s69 -; GFX9-NEXT: v_mov_b32_e32 v21, s68 -; GFX9-NEXT: v_mov_b32_e32 v45, s64 +; GFX9-NEXT: v_mov_b32_e32 v19, s7 +; GFX9-NEXT: v_mov_b32_e32 v26, s6 +; GFX9-NEXT: v_mov_b32_e32 v8, s53 +; GFX9-NEXT: v_mov_b32_e32 v14, s70 +; GFX9-NEXT: v_mov_b32_e32 v44, s69 +; GFX9-NEXT: v_mov_b32_e32 v27, s65 +; GFX9-NEXT: v_mov_b32_e32 v25, s68 +; GFX9-NEXT: v_mov_b32_e32 v46, s67 ; GFX9-NEXT: v_mov_b32_e32 v22, s66 -; GFX9-NEXT: v_mov_b32_e32 v19, s65 -; GFX9-NEXT: v_mov_b32_e32 v47, s99 +; GFX9-NEXT: v_mov_b32_e32 v45, s54 +; GFX9-NEXT: 
v_mov_b32_e32 v23, s52 +; GFX9-NEXT: v_mov_b32_e32 v56, s64 ; GFX9-NEXT: v_mov_b32_e32 v20, s55 -; GFX9-NEXT: v_mov_b32_e32 v17, s54 -; GFX9-NEXT: v_mov_b32_e32 v57, s96 -; GFX9-NEXT: v_mov_b32_e32 v18, s98 -; GFX9-NEXT: v_mov_b32_e32 v15, s97 -; GFX9-NEXT: v_mov_b32_e32 v59, s85 -; GFX9-NEXT: v_mov_b32_e32 v16, s87 -; GFX9-NEXT: v_mov_b32_e32 v13, s86 -; GFX9-NEXT: v_mov_b32_e32 v61, s82 -; GFX9-NEXT: v_mov_b32_e32 v14, s84 -; GFX9-NEXT: v_mov_b32_e32 v7, s83 -; GFX9-NEXT: v_mov_b32_e32 v28, s80 -; GFX9-NEXT: v_mov_b32_e32 v8, s81 -; GFX9-NEXT: v_mov_b32_e32 v1, s78 -; GFX9-NEXT: v_mov_b32_e32 v2, s88 -; GFX9-NEXT: v_mov_b32_e32 v3, s90 -; GFX9-NEXT: v_mov_b32_e32 v4, s92 -; GFX9-NEXT: v_mov_b32_e32 v5, s94 -; GFX9-NEXT: v_mov_b32_e32 v9, s30 +; GFX9-NEXT: v_mov_b32_e32 v47, s97 +; GFX9-NEXT: v_mov_b32_e32 v58, s99 +; GFX9-NEXT: v_mov_b32_e32 v15, s98 +; GFX9-NEXT: v_mov_b32_e32 v57, s86 +; GFX9-NEXT: v_mov_b32_e32 v60, s96 +; GFX9-NEXT: v_mov_b32_e32 v9, s87 +; GFX9-NEXT: v_mov_b32_e32 v59, s83 +; GFX9-NEXT: v_mov_b32_e32 v28, s85 +; GFX9-NEXT: v_mov_b32_e32 v3, s84 +; GFX9-NEXT: v_mov_b32_e32 v61, s80 +; GFX9-NEXT: v_mov_b32_e32 v30, s82 +; GFX9-NEXT: v_mov_b32_e32 v1, s81 +; GFX9-NEXT: v_mov_b32_e32 v29, s71 +; GFX9-NEXT: v_mov_b32_e32 v5, s78 +; GFX9-NEXT: v_mov_b32_e32 v6, s88 +; GFX9-NEXT: v_mov_b32_e32 v7, s90 +; GFX9-NEXT: v_mov_b32_e32 v11, s92 +; GFX9-NEXT: v_mov_b32_e32 v12, s94 +; GFX9-NEXT: v_mov_b32_e32 v13, s30 ; GFX9-NEXT: v_readlane_b32 s11, v62, 10 ; GFX9-NEXT: v_readlane_b32 s12, v62, 11 ; GFX9-NEXT: v_readlane_b32 s13, v62, 12 @@ -167408,7 +167532,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: v_readlane_b32 s10, v62, 19 ; GFX9-NEXT: v_readlane_b32 s41, v62, 20 ; GFX9-NEXT: v_readlane_b32 s43, v62, 21 -; GFX9-NEXT: v_readlane_b32 s45, v62, 22 +; GFX9-NEXT: v_readlane_b32 s47, v62, 22 ; GFX9-NEXT: v_readlane_b32 s75, v62, 23 ; GFX9-NEXT: v_readlane_b32 s79, v62, 24 ; GFX9-NEXT: v_readlane_b32 
s74, v62, 25 @@ -167433,7 +167557,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: v_readlane_b32 vcc_hi, v62, 44 ; GFX9-NEXT: v_readlane_b32 s56, v62, 45 ; GFX9-NEXT: v_readlane_b32 s30, v62, 46 -; GFX9-NEXT: v_readlane_b32 s47, v62, 47 +; GFX9-NEXT: v_readlane_b32 s45, v62, 47 ; GFX9-NEXT: v_readlane_b32 s8, v62, 48 ; GFX9-NEXT: v_readlane_b32 s7, v62, 49 ; GFX9-NEXT: v_readlane_b32 s35, v62, 9 @@ -167446,14 +167570,14 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_lshl_b32 s7, s7, 8 ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_and_b32 s7, s8, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s44, 8 +; GFX9-NEXT: s_lshl_b32 s8, s46, 8 ; GFX9-NEXT: s_or_b32 s7, s7, s8 ; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 ; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: v_mov_b32_e32 v30, s6 +; GFX9-NEXT: v_mov_b32_e32 v31, s6 ; GFX9-NEXT: s_and_b32 s6, s17, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s47, 8 +; GFX9-NEXT: s_lshl_b32 s7, s45, 8 ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_and_b32 s7, s11, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s30, 8 @@ -167461,8 +167585,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 ; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen -; GFX9-NEXT: v_mov_b32_e32 v30, s6 +; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen +; GFX9-NEXT: v_mov_b32_e32 v31, s6 ; GFX9-NEXT: s_and_b32 s6, s18, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s56, 8 ; GFX9-NEXT: s_or_b32 s6, s6, s7 @@ -167472,8 +167596,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 ; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: v_mov_b32_e32 v30, s6 +; GFX9-NEXT: 
buffer_store_dword v31, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_mov_b32_e32 v31, s6 ; GFX9-NEXT: s_and_b32 s6, s19, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s57, 8 ; GFX9-NEXT: s_or_b32 s6, s6, s7 @@ -167483,8 +167607,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 ; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: v_mov_b32_e32 v30, s6 +; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: v_mov_b32_e32 v31, s6 ; GFX9-NEXT: s_and_b32 s6, s20, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s58, 8 ; GFX9-NEXT: s_or_b32 s6, s6, s7 @@ -167494,8 +167618,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 ; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: v_mov_b32_e32 v30, s6 +; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: v_mov_b32_e32 v31, s6 ; GFX9-NEXT: s_and_b32 s6, s21, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s59, 8 ; GFX9-NEXT: s_or_b32 s6, s6, s7 @@ -167505,8 +167629,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 ; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: v_mov_b32_e32 v30, s6 +; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: v_mov_b32_e32 v31, s6 ; GFX9-NEXT: s_and_b32 s6, s22, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s60, 8 ; GFX9-NEXT: s_or_b32 s6, s6, s7 @@ -167516,8 +167640,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 ; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: 
buffer_store_dword v30, v0, s[0:3], 0 offen offset:20 -; GFX9-NEXT: v_mov_b32_e32 v30, s6 +; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: v_mov_b32_e32 v31, s6 ; GFX9-NEXT: s_and_b32 s6, s23, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s61, 8 ; GFX9-NEXT: s_or_b32 s6, s6, s7 @@ -167527,8 +167651,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 ; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:24 -; GFX9-NEXT: v_mov_b32_e32 v30, s6 +; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: v_mov_b32_e32 v31, s6 ; GFX9-NEXT: s_and_b32 s6, s24, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s62, 8 ; GFX9-NEXT: s_or_b32 s6, s6, s7 @@ -167538,8 +167662,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 ; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: v_mov_b32_e32 v30, s6 +; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: v_mov_b32_e32 v31, s6 ; GFX9-NEXT: s_and_b32 s6, s25, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s63, 8 ; GFX9-NEXT: s_or_b32 s6, s6, s7 @@ -167549,8 +167673,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 ; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: v_mov_b32_e32 v30, s6 +; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: v_mov_b32_e32 v31, s6 ; GFX9-NEXT: s_and_b32 s6, s26, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s72, 8 ; GFX9-NEXT: s_or_b32 s6, s6, s7 @@ -167560,8 +167684,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: 
s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 ; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:36 -; GFX9-NEXT: v_mov_b32_e32 v30, s6 +; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: v_mov_b32_e32 v31, s6 ; GFX9-NEXT: s_and_b32 s6, s27, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s73, 8 ; GFX9-NEXT: s_or_b32 s6, s6, s7 @@ -167571,8 +167695,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 ; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:40 -; GFX9-NEXT: v_mov_b32_e32 v30, s6 +; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: v_mov_b32_e32 v31, s6 ; GFX9-NEXT: s_and_b32 s6, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s74, 8 ; GFX9-NEXT: s_or_b32 s6, s6, s7 @@ -167582,19 +167706,19 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 ; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:44 -; GFX9-NEXT: v_mov_b32_e32 v30, s6 +; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: v_mov_b32_e32 v31, s6 ; GFX9-NEXT: s_and_b32 s6, s29, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s75, 8 ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_and_b32 s7, s77, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s45, 8 +; GFX9-NEXT: s_lshl_b32 s8, s47, 8 ; GFX9-NEXT: s_or_b32 s7, s7, s8 ; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 ; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: v_mov_b32_e32 v30, s6 +; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: v_mov_b32_e32 v31, s6 ; GFX9-NEXT: s_and_b32 s4, s4, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s43, 8 ; GFX9-NEXT: 
s_or_b32 s4, s4, s6 @@ -167604,8 +167728,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 ; GFX9-NEXT: s_or_b32 s4, s4, s6 -; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:52 -; GFX9-NEXT: v_mov_b32_e32 v30, s4 +; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: v_mov_b32_e32 v31, s4 ; GFX9-NEXT: s_and_b32 s4, s5, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s10, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s5 @@ -167615,23 +167739,13 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s5, s5, 16 ; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:56 -; GFX9-NEXT: v_mov_b32_e32 v30, s4 -; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX9-NEXT: v_or_b32_sdwa v11, v29, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GFX9-NEXT: v_or_b32_sdwa v5, v58, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GFX9-NEXT: v_or_b32_sdwa v4, v56, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX9-NEXT: v_or_b32_sdwa v3, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX9-NEXT: v_or_b32_sdwa v2, v44, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: v_mov_b32_e32 v31, s4 +; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:60 +; 
GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX9-NEXT: v_or_b32_sdwa v18, v30, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_readlane_b32 s99, v63, 35 ; GFX9-NEXT: v_readlane_b32 s98, v63, 34 ; GFX9-NEXT: v_readlane_b32 s97, v63, 33 @@ -167669,93 +167783,103 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: v_readlane_b32 s31, v63, 1 ; GFX9-NEXT: v_readlane_b32 s30, v63, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v7, v30, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:64 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v8 -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v28 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v29 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v8, v11, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:68 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v10 -; GFX9-NEXT: v_or_b32_sdwa v7, v33, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v27, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:72 -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v14 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v61 -; GFX9-NEXT: v_or_b32_sdwa v7, v31, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:68 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v17 +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v28, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b32_e32 
v1, 8, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v61 +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v8, v10, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:76 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v9 -; GFX9-NEXT: v_or_b32_sdwa v7, v36, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v60, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:80 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v16 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v59 -; GFX9-NEXT: v_or_b32_sdwa v7, v34, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v32, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:84 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v17 -; GFX9-NEXT: v_or_b32_sdwa v7, v39, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:88 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v18 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v57 -; GFX9-NEXT: v_or_b32_sdwa v5, v37, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v35, v7 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:92 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v19 -; GFX9-NEXT: v_or_b32_sdwa v5, v50, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:96 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v20 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v47 -; GFX9-NEXT: v_or_b32_sdwa v4, v48, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v38, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:100 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v21 -; GFX9-NEXT: v_or_b32_sdwa v4, v53, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:104 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v22 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v45 -; GFX9-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v49, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:108 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v23 -; GFX9-NEXT: v_or_b32_sdwa v3, v40, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: 
v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:112 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v24 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v43 -; GFX9-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v52, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:116 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v25 -; GFX9-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v12 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:76 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v13 +; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v10 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v59 +; GFX9-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v33, v2 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:84 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v12 +; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:88 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v16 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v57 +; GFX9-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:92 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v20 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v11 +; GFX9-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:96 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v47 +; GFX9-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, 
v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v22 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v7 +; GFX9-NEXT: v_or_b32_sdwa v1, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:104 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v45 +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:108 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v6 ; GFX9-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v44, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v27 +; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, 
v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:116 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v26 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v5 +; GFX9-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v14 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v8 +; GFX9-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 ; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload @@ -167806,20 +167930,20 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: v_readfirstlane_b32 s59, v8 ; GFX11-NEXT: v_writelane_b32 v40, s35, 3 ; GFX11-NEXT: v_writelane_b32 v41, s99, 3 -; GFX11-NEXT: v_readfirstlane_b32 s56, v9 -; GFX11-NEXT: v_readfirstlane_b32 s57, v10 -; GFX11-NEXT: v_readfirstlane_b32 s46, v11 +; GFX11-NEXT: v_readfirstlane_b32 s8, v9 +; GFX11-NEXT: v_readfirstlane_b32 s9, v10 +; GFX11-NEXT: v_readfirstlane_b32 s6, v11 ; GFX11-NEXT: v_writelane_b32 v40, s36, 4 ; GFX11-NEXT: v_writelane_b32 v41, s100, 4 -; GFX11-NEXT: v_readfirstlane_b32 s47, v12 -; GFX11-NEXT: v_readfirstlane_b32 s44, v13 -; GFX11-NEXT: v_readfirstlane_b32 s45, v14 +; GFX11-NEXT: v_readfirstlane_b32 s7, v12 +; GFX11-NEXT: v_readfirstlane_b32 s4, v13 +; GFX11-NEXT: v_readfirstlane_b32 s5, 
v14 ; GFX11-NEXT: v_writelane_b32 v40, s37, 5 ; GFX11-NEXT: v_writelane_b32 v41, s101, 5 ; GFX11-NEXT: s_mov_b32 vcc_hi, 0 -; GFX11-NEXT: s_and_b32 s4, vcc_lo, exec_lo -; GFX11-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane +; GFX11-NEXT: s_and_b32 s10, vcc_lo, exec_lo ; GFX11-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane +; GFX11-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane ; GFX11-NEXT: v_writelane_b32 v40, s38, 6 ; GFX11-NEXT: v_writelane_b32 v41, s102, 6 ; GFX11-NEXT: v_writelane_b32 v40, s39, 7 @@ -167851,522 +167975,511 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: v_writelane_b32 v40, s87, 31 ; GFX11-NEXT: s_cbranch_scc0 .LBB91_3 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: s_lshr_b32 s4, s27, 24 -; GFX11-NEXT: s_lshr_b64 s[12:13], s[26:27], 24 -; GFX11-NEXT: v_writelane_b32 v43, s4, 15 -; GFX11-NEXT: s_lshr_b32 s4, s27, 16 -; GFX11-NEXT: s_lshr_b32 s99, s2, 16 -; GFX11-NEXT: s_lshr_b32 s100, s2, 8 -; GFX11-NEXT: s_lshr_b32 s101, s1, 24 -; GFX11-NEXT: v_writelane_b32 v43, s4, 14 -; GFX11-NEXT: s_lshr_b32 s4, s27, 8 -; GFX11-NEXT: s_lshr_b32 s11, s1, 16 -; GFX11-NEXT: s_lshr_b32 s102, s1, 8 -; GFX11-NEXT: s_lshr_b32 s103, s0, 16 -; GFX11-NEXT: v_writelane_b32 v43, s4, 16 -; GFX11-NEXT: s_lshr_b32 s4, s26, 16 +; GFX11-NEXT: s_lshr_b32 s10, s27, 24 +; GFX11-NEXT: s_lshr_b64 s[42:43], s[22:23], 24 +; GFX11-NEXT: v_writelane_b32 v42, s10, 13 +; GFX11-NEXT: s_lshr_b32 s10, s27, 16 +; GFX11-NEXT: s_lshr_b64 s[46:47], s[16:17], 24 +; GFX11-NEXT: s_lshr_b32 s41, s1, 16 +; GFX11-NEXT: s_lshr_b32 s99, s1, 8 +; GFX11-NEXT: v_writelane_b32 v42, s10, 12 +; GFX11-NEXT: s_lshr_b32 s10, s27, 8 +; GFX11-NEXT: s_lshr_b32 s44, s0, 16 ; GFX11-NEXT: s_lshr_b32 s104, s0, 8 -; GFX11-NEXT: s_lshr_b32 s85, s45, 24 -; GFX11-NEXT: s_lshr_b32 s10, s45, 16 -; GFX11-NEXT: v_writelane_b32 v43, s4, 17 -; GFX11-NEXT: s_lshr_b32 s4, s26, 8 -; GFX11-NEXT: s_lshr_b32 s5, s45, 8 -; GFX11-NEXT: s_lshr_b32 s87, 
s44, 16 -; GFX11-NEXT: s_lshr_b32 s86, s44, 8 -; GFX11-NEXT: v_writelane_b32 v43, s4, 18 -; GFX11-NEXT: s_lshr_b32 s4, s25, 24 -; GFX11-NEXT: s_lshr_b32 s81, s47, 24 -; GFX11-NEXT: s_lshr_b32 s98, s47, 16 -; GFX11-NEXT: s_lshr_b32 s84, s47, 8 -; GFX11-NEXT: v_writelane_b32 v43, s4, 19 -; GFX11-NEXT: s_lshr_b32 s4, s25, 16 -; GFX11-NEXT: s_lshr_b32 s48, s46, 8 -; GFX11-NEXT: s_lshr_b32 s70, s57, 24 -; GFX11-NEXT: s_lshr_b32 s97, s57, 16 -; GFX11-NEXT: v_writelane_b32 v43, s4, 13 -; GFX11-NEXT: s_lshr_b32 s4, s25, 8 -; GFX11-NEXT: s_lshr_b32 s80, s57, 8 -; GFX11-NEXT: s_lshr_b32 s83, s56, 16 -; GFX11-NEXT: s_lshr_b32 s82, s56, 8 -; GFX11-NEXT: v_writelane_b32 v43, s4, 20 -; GFX11-NEXT: s_lshr_b32 s4, s24, 16 -; GFX11-NEXT: s_lshr_b32 s66, s59, 24 -; GFX11-NEXT: s_lshr_b32 s9, s59, 16 -; GFX11-NEXT: s_lshr_b32 s69, s59, 8 -; GFX11-NEXT: v_writelane_b32 v43, s4, 21 -; GFX11-NEXT: s_lshr_b32 s4, s24, 8 +; GFX11-NEXT: s_lshr_b32 s84, s5, 24 +; GFX11-NEXT: v_writelane_b32 v42, s10, 14 +; GFX11-NEXT: s_lshr_b32 s10, s26, 16 +; GFX11-NEXT: s_lshr_b32 s40, s5, 16 +; GFX11-NEXT: s_lshr_b32 s11, s5, 8 +; GFX11-NEXT: s_lshr_b32 s87, s4, 16 +; GFX11-NEXT: v_writelane_b32 v42, s10, 15 +; GFX11-NEXT: s_lshr_b32 s10, s26, 8 +; GFX11-NEXT: s_lshr_b32 s86, s4, 8 +; GFX11-NEXT: s_lshr_b32 s48, s7, 24 +; GFX11-NEXT: s_lshr_b32 s98, s7, 16 +; GFX11-NEXT: v_writelane_b32 v42, s10, 16 +; GFX11-NEXT: s_lshr_b32 s10, s25, 24 +; GFX11-NEXT: s_lshr_b32 s83, s7, 8 +; GFX11-NEXT: s_lshr_b32 s85, s6, 8 +; GFX11-NEXT: s_lshr_b32 s69, s9, 24 +; GFX11-NEXT: v_writelane_b32 v42, s10, 17 +; GFX11-NEXT: s_lshr_b32 s10, s25, 16 +; GFX11-NEXT: s_lshr_b32 s97, s9, 16 +; GFX11-NEXT: s_lshr_b32 s80, s9, 8 +; GFX11-NEXT: s_lshr_b32 s82, s8, 16 +; GFX11-NEXT: v_writelane_b32 v42, s10, 11 +; GFX11-NEXT: s_lshr_b32 s10, s25, 8 +; GFX11-NEXT: s_lshr_b32 s81, s8, 8 +; GFX11-NEXT: s_lshr_b32 s65, s59, 24 +; GFX11-NEXT: s_lshr_b32 s15, s59, 16 +; GFX11-NEXT: v_writelane_b32 v42, s10, 18 +; GFX11-NEXT: s_lshr_b32 
s10, s24, 16 +; GFX11-NEXT: s_lshr_b32 s68, s59, 8 ; GFX11-NEXT: s_lshr_b32 s71, s58, 16 -; GFX11-NEXT: s_lshr_b32 s39, s58, 8 -; GFX11-NEXT: s_lshr_b32 s55, s61, 24 -; GFX11-NEXT: v_writelane_b32 v43, s4, 22 -; GFX11-NEXT: s_lshr_b32 s4, s23, 24 -; GFX11-NEXT: s_lshr_b32 s8, s61, 16 -; GFX11-NEXT: s_lshr_b32 s65, s61, 8 -; GFX11-NEXT: s_lshr_b32 s68, s60, 16 -; GFX11-NEXT: v_writelane_b32 v43, s4, 23 -; GFX11-NEXT: s_lshr_b32 s4, s23, 16 -; GFX11-NEXT: s_lshr_b32 s67, s60, 8 -; GFX11-NEXT: s_lshr_b32 s51, s63, 24 +; GFX11-NEXT: s_lshr_b32 s70, s58, 8 +; GFX11-NEXT: v_writelane_b32 v42, s10, 19 +; GFX11-NEXT: s_lshr_b32 s10, s24, 8 +; GFX11-NEXT: s_lshr_b32 s54, s61, 24 +; GFX11-NEXT: s_lshr_b32 s14, s61, 16 +; GFX11-NEXT: s_lshr_b32 s39, s61, 8 +; GFX11-NEXT: v_writelane_b32 v42, s10, 20 +; GFX11-NEXT: s_lshr_b32 s10, s23, 24 +; GFX11-NEXT: s_lshr_b32 s67, s60, 16 +; GFX11-NEXT: s_lshr_b32 s66, s60, 8 +; GFX11-NEXT: s_lshr_b32 s38, s63, 24 +; GFX11-NEXT: v_writelane_b32 v42, s10, 21 +; GFX11-NEXT: s_lshr_b32 s10, s23, 16 ; GFX11-NEXT: s_lshr_b32 s96, s63, 16 -; GFX11-NEXT: v_writelane_b32 v43, s4, 12 -; GFX11-NEXT: s_lshr_b32 s4, s23, 8 -; GFX11-NEXT: s_lshr_b32 s54, s63, 8 -; GFX11-NEXT: s_lshr_b32 s38, s62, 16 -; GFX11-NEXT: s_lshr_b32 s64, s62, 8 -; GFX11-NEXT: v_writelane_b32 v43, s4, 24 -; GFX11-NEXT: s_lshr_b32 s4, s22, 16 +; GFX11-NEXT: s_lshr_b32 s53, s63, 8 +; GFX11-NEXT: s_lshr_b32 s64, s62, 16 +; GFX11-NEXT: v_writelane_b32 v42, s10, 10 +; GFX11-NEXT: s_lshr_b32 s10, s23, 8 +; GFX11-NEXT: s_lshr_b32 s55, s62, 8 ; GFX11-NEXT: s_lshr_b32 s36, s73, 24 -; GFX11-NEXT: s_lshr_b32 s7, s73, 16 +; GFX11-NEXT: s_lshr_b32 s13, s73, 16 +; GFX11-NEXT: v_writelane_b32 v42, s10, 22 +; GFX11-NEXT: s_lshr_b32 s10, s22, 16 ; GFX11-NEXT: s_lshr_b32 s50, s73, 8 -; GFX11-NEXT: v_writelane_b32 v43, s4, 25 -; GFX11-NEXT: s_lshr_b32 s4, s22, 8 -; GFX11-NEXT: s_lshr_b32 s53, s72, 16 -; GFX11-NEXT: s_lshr_b32 s52, s72, 8 +; GFX11-NEXT: s_lshr_b32 s52, s72, 16 +; GFX11-NEXT: 
s_lshr_b32 s51, s72, 8 +; GFX11-NEXT: v_writelane_b32 v42, s10, 23 +; GFX11-NEXT: s_lshr_b32 s10, s22, 8 ; GFX11-NEXT: s_lshr_b32 s34, s29, 24 -; GFX11-NEXT: v_writelane_b32 v43, s4, 26 -; GFX11-NEXT: s_lshr_b32 s4, s21, 24 -; GFX11-NEXT: s_lshr_b32 s6, s29, 16 +; GFX11-NEXT: s_lshr_b32 s12, s29, 16 ; GFX11-NEXT: s_lshr_b32 s35, s29, 8 -; GFX11-NEXT: s_lshr_b32 s37, s28, 16 -; GFX11-NEXT: v_writelane_b32 v43, s4, 27 -; GFX11-NEXT: s_lshr_b32 s4, s21, 16 -; GFX11-NEXT: s_lshr_b32 s49, s28, 8 -; GFX11-NEXT: s_lshr_b64 s[14:15], s[16:17], 24 -; GFX11-NEXT: s_lshr_b64 s[40:41], s[2:3], 24 -; GFX11-NEXT: v_writelane_b32 v43, s4, 11 -; GFX11-NEXT: s_lshr_b32 s4, s21, 8 -; GFX11-NEXT: s_lshr_b64 s[42:43], s[0:1], 24 -; GFX11-NEXT: s_lshr_b64 s[74:75], s[44:45], 24 -; GFX11-NEXT: s_lshr_b64 s[76:77], s[46:47], 24 -; GFX11-NEXT: v_writelane_b32 v43, s4, 28 -; GFX11-NEXT: s_lshr_b32 s4, s20, 16 -; GFX11-NEXT: s_lshr_b64 s[78:79], s[56:57], 24 +; GFX11-NEXT: v_writelane_b32 v42, s10, 24 +; GFX11-NEXT: s_lshr_b32 s10, s21, 24 +; GFX11-NEXT: s_lshr_b32 s49, s28, 16 +; GFX11-NEXT: s_lshr_b32 s37, s28, 8 +; GFX11-NEXT: s_lshr_b64 s[100:101], s[26:27], 24 +; GFX11-NEXT: v_writelane_b32 v42, s10, 25 +; GFX11-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-NEXT: s_lshr_b64 s[102:103], s[24:25], 24 +; GFX11-NEXT: s_lshr_b64 s[56:57], s[0:1], 24 +; GFX11-NEXT: s_lshr_b64 s[74:75], s[4:5], 24 +; GFX11-NEXT: v_writelane_b32 v42, s10, 9 +; GFX11-NEXT: s_lshr_b32 s10, s21, 8 +; GFX11-NEXT: s_lshr_b64 s[76:77], s[6:7], 24 +; GFX11-NEXT: s_lshr_b64 s[78:79], s[8:9], 24 ; GFX11-NEXT: s_lshr_b64 s[88:89], s[58:59], 24 +; GFX11-NEXT: v_writelane_b32 v42, s10, 26 +; GFX11-NEXT: s_lshr_b32 s10, s20, 16 ; GFX11-NEXT: s_lshr_b64 s[90:91], s[60:61], 24 -; GFX11-NEXT: v_writelane_b32 v43, s4, 29 -; GFX11-NEXT: s_lshr_b32 s4, s20, 8 ; GFX11-NEXT: s_lshr_b64 s[92:93], s[62:63], 24 ; GFX11-NEXT: s_lshr_b64 s[94:95], s[72:73], 24 +; GFX11-NEXT: v_writelane_b32 v42, s10, 27 +; GFX11-NEXT: s_lshr_b32 s10, s20, 8 ; 
GFX11-NEXT: s_lshr_b64 s[30:31], s[28:29], 24 -; GFX11-NEXT: v_writelane_b32 v43, s4, 30 -; GFX11-NEXT: s_lshr_b32 s4, s19, 24 +; GFX11-NEXT: v_writelane_b32 v42, s10, 28 +; GFX11-NEXT: s_lshr_b32 s10, s19, 24 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v43, s4, 31 -; GFX11-NEXT: s_lshr_b32 s4, s19, 16 -; GFX11-NEXT: v_writelane_b32 v43, s4, 10 -; GFX11-NEXT: s_lshr_b32 s4, s19, 8 +; GFX11-NEXT: v_writelane_b32 v42, s10, 29 +; GFX11-NEXT: s_lshr_b32 s10, s19, 16 +; GFX11-NEXT: v_writelane_b32 v42, s10, 8 +; GFX11-NEXT: s_lshr_b32 s10, s19, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v42, s4, 0 -; GFX11-NEXT: s_lshr_b32 s4, s18, 16 -; GFX11-NEXT: v_writelane_b32 v42, s4, 1 -; GFX11-NEXT: s_lshr_b32 s4, s18, 8 +; GFX11-NEXT: v_writelane_b32 v42, s10, 30 +; GFX11-NEXT: s_lshr_b32 s10, s18, 16 +; GFX11-NEXT: v_writelane_b32 v42, s10, 31 +; GFX11-NEXT: s_lshr_b32 s10, s18, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v42, s4, 2 -; GFX11-NEXT: s_lshr_b32 s4, s17, 24 -; GFX11-NEXT: v_writelane_b32 v42, s4, 3 -; GFX11-NEXT: s_lshr_b32 s4, s17, 16 +; GFX11-NEXT: v_writelane_b32 v43, s10, 0 +; GFX11-NEXT: s_lshr_b32 s10, s17, 24 +; GFX11-NEXT: v_writelane_b32 v43, s10, 1 +; GFX11-NEXT: s_lshr_b32 s10, s17, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v43, s4, 9 -; GFX11-NEXT: s_lshr_b32 s4, s17, 8 -; GFX11-NEXT: v_writelane_b32 v42, s4, 4 -; GFX11-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-NEXT: v_writelane_b32 v42, s10, 7 +; GFX11-NEXT: s_lshr_b32 s10, s17, 8 +; GFX11-NEXT: v_writelane_b32 v43, s10, 2 +; GFX11-NEXT: s_lshr_b32 s10, s16, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v42, s4, 
5 -; GFX11-NEXT: s_lshr_b32 s4, s16, 8 -; GFX11-NEXT: v_writelane_b32 v42, s4, 6 -; GFX11-NEXT: s_lshr_b32 s4, s3, 24 +; GFX11-NEXT: v_writelane_b32 v43, s10, 3 +; GFX11-NEXT: s_lshr_b32 s10, s16, 8 +; GFX11-NEXT: v_writelane_b32 v43, s10, 4 +; GFX11-NEXT: s_lshr_b32 s10, s3, 24 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v42, s4, 7 -; GFX11-NEXT: s_lshr_b32 s4, s3, 16 -; GFX11-NEXT: v_writelane_b32 v43, s4, 8 -; GFX11-NEXT: s_lshr_b32 s4, s3, 8 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v42, s4, 8 -; GFX11-NEXT: s_lshr_b32 s4, s46, 16 -; GFX11-NEXT: v_writelane_b32 v43, s12, 6 -; GFX11-NEXT: v_writelane_b32 v43, s13, 7 -; GFX11-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 -; GFX11-NEXT: v_writelane_b32 v43, s12, 4 -; GFX11-NEXT: v_writelane_b32 v43, s13, 5 -; GFX11-NEXT: s_lshr_b64 s[12:13], s[22:23], 24 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v43, s12, 2 -; GFX11-NEXT: v_writelane_b32 v43, s13, 3 -; GFX11-NEXT: s_lshr_b64 s[12:13], s[20:21], 24 -; GFX11-NEXT: v_writelane_b32 v43, s12, 0 -; GFX11-NEXT: v_writelane_b32 v43, s13, 1 -; GFX11-NEXT: s_lshr_b64 s[12:13], s[18:19], 24 +; GFX11-NEXT: v_writelane_b32 v43, s10, 5 +; GFX11-NEXT: s_lshr_b32 s10, s3, 16 +; GFX11-NEXT: v_writelane_b32 v42, s10, 6 +; GFX11-NEXT: s_lshr_b32 s10, s3, 8 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v43, s10, 7 +; GFX11-NEXT: s_lshr_b32 s10, s2, 16 +; GFX11-NEXT: v_writelane_b32 v42, s42, 4 +; GFX11-NEXT: v_writelane_b32 v43, s10, 6 +; GFX11-NEXT: s_lshr_b32 s10, s2, 8 +; GFX11-NEXT: v_writelane_b32 v42, s43, 5 +; GFX11-NEXT: s_lshr_b64 s[42:43], s[20:21], 24 +; GFX11-NEXT: v_writelane_b32 v43, s10, 8 +; GFX11-NEXT: s_lshr_b32 s10, s1, 24 +; GFX11-NEXT: v_writelane_b32 v42, s42, 2 +; GFX11-NEXT: v_writelane_b32 v43, 
s10, 9 +; GFX11-NEXT: s_lshr_b32 s10, s6, 16 +; GFX11-NEXT: v_writelane_b32 v42, s43, 3 +; GFX11-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 +; GFX11-NEXT: v_writelane_b32 v42, s46, 0 +; GFX11-NEXT: v_writelane_b32 v42, s47, 1 +; GFX11-NEXT: s_lshr_b64 s[46:47], s[2:3], 24 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_hi ; GFX11-NEXT: s_cbranch_vccnz .LBB91_4 ; GFX11-NEXT: .LBB91_2: ; %cmp.true -; GFX11-NEXT: s_and_b32 s4, s29, 0xffff0000 -; GFX11-NEXT: s_and_b32 s14, s47, 0xffff0000 -; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 -; GFX11-NEXT: s_and_b32 s4, s1, 0xffff0000 -; GFX11-NEXT: s_lshl_b32 s15, s47, 16 -; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 -; GFX11-NEXT: s_lshl_b32 s6, s29, 16 +; GFX11-NEXT: s_and_b32 s10, s29, 0xffff0000 +; GFX11-NEXT: s_and_b32 s13, s9, 0xffff0000 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s10 +; GFX11-NEXT: s_lshl_b32 s10, s9, 16 +; GFX11-NEXT: s_and_b32 s9, s1, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s11, s29, 16 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s9 +; GFX11-NEXT: s_and_b32 s76, s28, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s77, s28, 16 ; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s6 -; GFX11-NEXT: s_and_b32 s8, s45, 0xffff0000 -; GFX11-NEXT: v_readfirstlane_b32 s47, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: s_and_b32 s28, s58, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s29, s58, 16 +; GFX11-NEXT: v_readfirstlane_b32 s58, v6 +; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s11 ; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1 -; GFX11-NEXT: s_lshl_b32 s7, s45, 16 -; GFX11-NEXT: s_and_b32 s78, s28, 0xffff0000 -; GFX11-NEXT: s_bfe_u32 s6, s47, 0x10010 -; GFX11-NEXT: s_lshl_b32 s79, s28, 16 -; GFX11-NEXT: s_add_i32 s45, s6, s47 -; GFX11-NEXT: s_and_b32 s5, s73, 0xffff0000 -; GFX11-NEXT: s_lshl_b32 s77, s73, 16 -; GFX11-NEXT: s_and_b32 s75, s72, 0xffff0000 -; GFX11-NEXT: s_lshl_b32 s76, s72, 16 -; GFX11-NEXT: s_and_b32 s11, s63, 0xffff0000 -; GFX11-NEXT: s_lshl_b32 s74, s63, 16 -; 
GFX11-NEXT: s_and_b32 s72, s62, 0xffff0000 -; GFX11-NEXT: s_lshl_b32 s73, s62, 16 -; GFX11-NEXT: s_and_b32 s63, s61, 0xffff0000 -; GFX11-NEXT: s_lshl_b32 s62, s61, 16 -; GFX11-NEXT: s_and_b32 s61, s60, 0xffff0000 -; GFX11-NEXT: s_lshl_b32 s60, s60, 16 +; GFX11-NEXT: s_and_b32 s12, s6, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s9, s6, 16 +; GFX11-NEXT: s_bfe_u32 s6, s58, 0x10010 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX11-NEXT: s_and_b32 s41, s59, 0xffff0000 ; GFX11-NEXT: s_lshl_b32 s40, s59, 16 -; GFX11-NEXT: s_and_b32 s28, s58, 0xffff0000 -; GFX11-NEXT: s_lshl_b32 s29, s58, 16 -; GFX11-NEXT: s_and_b32 s13, s57, 0xffff0000 -; GFX11-NEXT: s_lshl_b32 s10, s57, 16 -; GFX11-NEXT: s_and_b32 s42, s56, 0xffff0000 -; GFX11-NEXT: s_lshl_b32 s43, s56, 16 -; GFX11-NEXT: s_and_b32 s12, s46, 0xffff0000 -; GFX11-NEXT: s_lshl_b32 s9, s46, 16 -; GFX11-NEXT: s_and_b32 s4, s44, 0xffff0000 -; GFX11-NEXT: s_lshl_b32 s6, s44, 16 -; GFX11-NEXT: s_addk_i32 s45, 0x7fff -; GFX11-NEXT: s_bitset1_b32 s47, 22 +; GFX11-NEXT: s_add_i32 s59, s6, s58 +; GFX11-NEXT: s_and_b32 s74, s73, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s75, s73, 16 +; GFX11-NEXT: s_and_b32 s73, s72, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s72, s72, 16 +; GFX11-NEXT: s_and_b32 s11, s63, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s63, s63, 16 +; GFX11-NEXT: s_and_b32 s56, s62, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s57, s62, 16 +; GFX11-NEXT: s_and_b32 s47, s61, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s45, s61, 16 +; GFX11-NEXT: s_and_b32 s44, s60, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s46, s60, 16 +; GFX11-NEXT: s_and_b32 s42, s8, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s43, s8, 16 +; GFX11-NEXT: s_and_b32 s14, s7, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s15, s7, 16 +; GFX11-NEXT: s_and_b32 s8, s5, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s7, s5, 16 +; GFX11-NEXT: s_and_b32 s5, s4, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s6, s4, 16 +; GFX11-NEXT: s_addk_i32 s59, 0x7fff +; GFX11-NEXT: s_bitset1_b32 s58, 22 ; GFX11-NEXT: v_bfe_u32 v4, v2, 16, 1 ; 
GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 -; GFX11-NEXT: s_and_b32 s44, vcc_lo, exec_lo +; GFX11-NEXT: s_and_b32 s4, vcc_lo, exec_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_cselect_b32 s44, s47, s45 +; GFX11-NEXT: s_cselect_b32 s4, s58, s59 ; GFX11-NEXT: s_lshl_b32 s1, s1, 16 ; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v2 -; GFX11-NEXT: s_lshr_b32 s58, s44, 16 +; GFX11-NEXT: s_lshr_b32 s61, s4, 16 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo ; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s1 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s78 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s76 ; GFX11-NEXT: v_readfirstlane_b32 s1, v3 -; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s79 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s77 +; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v1 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: s_bfe_u32 s45, s1, 0x10010 +; GFX11-NEXT: s_bfe_u32 s58, s1, 0x10010 ; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1 -; GFX11-NEXT: s_add_i32 s45, s45, s1 +; GFX11-NEXT: s_add_i32 s58, s58, s1 ; GFX11-NEXT: s_bitset1_b32 s1, 22 -; GFX11-NEXT: s_addk_i32 s45, 0x7fff -; GFX11-NEXT: s_and_b32 s44, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s1, s1, s45 -; GFX11-NEXT: s_and_b32 s44, s0, 0xffff0000 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v2 -; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s44 +; GFX11-NEXT: s_addk_i32 s58, 0x7fff +; GFX11-NEXT: s_and_b32 s4, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s1, s1, s58 +; GFX11-NEXT: s_and_b32 s4, s0, 0xffff0000 +; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v2 +; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s4 ; GFX11-NEXT: v_bfe_u32 v5, v7, 16, 1 ; GFX11-NEXT: v_add_nc_u32_e32 v3, v4, v6 ; GFX11-NEXT: s_lshr_b32 s1, s1, 16 ; 
GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v7 -; GFX11-NEXT: v_readfirstlane_b32 s44, v2 +; GFX11-NEXT: v_readfirstlane_b32 s4, v2 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 -; GFX11-NEXT: s_bfe_u32 s45, s44, 0x10010 +; GFX11-NEXT: s_bfe_u32 s58, s4, 0x10010 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_add_i32 s45, s45, s44 -; GFX11-NEXT: s_bitset1_b32 s44, 22 -; GFX11-NEXT: s_addk_i32 s45, 0x7fff -; GFX11-NEXT: s_and_b32 s46, vcc_lo, exec_lo +; GFX11-NEXT: s_add_i32 s58, s58, s4 +; GFX11-NEXT: s_bitset1_b32 s4, 22 +; GFX11-NEXT: s_addk_i32 s58, 0x7fff +; GFX11-NEXT: s_and_b32 s59, vcc_lo, exec_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX11-NEXT: v_add_nc_u32_e32 v1, v5, v7 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v6 -; GFX11-NEXT: s_cselect_b32 s44, s44, s45 +; GFX11-NEXT: s_cselect_b32 s4, s4, s58 ; GFX11-NEXT: s_lshl_b32 s0, s0, 16 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_lshr_b32 s4, s4, 16 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s75 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo ; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s0 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v21 -; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s5 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v22 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s74 ; GFX11-NEXT: v_readfirstlane_b32 s0, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s77 -; GFX11-NEXT: s_bfe_u32 s5, s0, 0x10010 -; GFX11-NEXT: v_lshl_or_b32 v7, v22, 16, v4 -; GFX11-NEXT: s_add_i32 s45, s5, s0 -; GFX11-NEXT: s_lshr_b32 s5, s44, 16 -; GFX11-NEXT: s_addk_i32 s45, 0x7fff +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: s_bfe_u32 s58, s0, 0x10010 +; GFX11-NEXT: 
v_lshl_or_b32 v2, v23, 16, v4 +; GFX11-NEXT: s_add_i32 s58, s58, s0 ; GFX11-NEXT: s_bitset1_b32 s0, 22 -; GFX11-NEXT: s_and_b32 s44, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s0, s0, s45 -; GFX11-NEXT: s_and_b32 s44, s3, 0xffff0000 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v1 -; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s44 -; GFX11-NEXT: v_bfe_u32 v6, v8, 16, 1 +; GFX11-NEXT: s_addk_i32 s58, 0x7fff +; GFX11-NEXT: s_and_b32 s59, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s0, s0, s58 +; GFX11-NEXT: s_and_b32 s58, s3, 0xffff0000 +; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s58 ; GFX11-NEXT: v_bfe_u32 v1, v5, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v8, v6 ; GFX11-NEXT: s_lshr_b32 s0, s0, 16 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v23 -; GFX11-NEXT: v_readfirstlane_b32 s44, v9 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v24 +; GFX11-NEXT: v_readfirstlane_b32 s58, v9 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v5 -; GFX11-NEXT: v_lshl_or_b32 v6, v2, 16, v3 -; GFX11-NEXT: s_bfe_u32 s45, s44, 0x10010 -; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v5 -; GFX11-NEXT: s_add_i32 s45, s45, s44 -; GFX11-NEXT: s_bitset1_b32 s44, 22 -; GFX11-NEXT: s_addk_i32 s45, 0x7fff -; GFX11-NEXT: s_and_b32 s46, vcc_lo, exec_lo -; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4 -; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 -; GFX11-NEXT: s_cselect_b32 s44, s44, s45 +; GFX11-NEXT: v_add_nc_u32_e32 v10, v1, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: v_lshl_or_b32 v1, v7, 16, v3 +; GFX11-NEXT: s_bfe_u32 s59, s58, 0x10010 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX11-NEXT: s_add_i32 s59, s59, s58 +; GFX11-NEXT: s_bitset1_b32 s58, 22 +; GFX11-NEXT: s_addk_i32 s59, 0x7fff +; GFX11-NEXT: s_and_b32 s60, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s58, s58, s59 ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; 
GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v10 ; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s3 -; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s76 -; GFX11-NEXT: s_lshr_b32 s59, s44, 16 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo -; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s75 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s72 +; GFX11-NEXT: s_lshr_b32 s62, s58, 16 ; GFX11-NEXT: v_readfirstlane_b32 s3, v10 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-NEXT: v_bfe_u32 v8, v9, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v87, 24, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v96, 16, v6 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo -; GFX11-NEXT: v_bfe_u32 v2, v4, 16, 1 -; GFX11-NEXT: s_bfe_u32 s45, s3, 0x10010 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s73 +; GFX11-NEXT: s_bfe_u32 s59, s3, 0x10010 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-NEXT: s_add_i32 s45, s45, s3 +; GFX11-NEXT: s_add_i32 s59, s59, s3 ; GFX11-NEXT: s_bitset1_b32 s3, 22 -; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v4 -; GFX11-NEXT: s_addk_i32 s45, 0x7fff -; GFX11-NEXT: s_and_b32 s44, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s3, s3, s45 -; GFX11-NEXT: s_and_b32 s44, s2, 0xffff0000 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v1 -; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s44 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v3, v8, v9 +; GFX11-NEXT: s_addk_i32 s59, 0x7fff +; GFX11-NEXT: v_add_nc_u32_e32 v5, v8, v9 +; GFX11-NEXT: s_and_b32 s58, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s3, s3, s59 +; GFX11-NEXT: s_and_b32 s58, s2, 0xffff0000 +; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, 
v3 +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s58 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v9 -; GFX11-NEXT: v_readfirstlane_b32 s44, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-NEXT: v_readfirstlane_b32 s58, v3 ; GFX11-NEXT: s_lshr_b32 s3, s3, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v4 -; GFX11-NEXT: s_bfe_u32 s45, s44, 0x10010 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v24 -; GFX11-NEXT: s_add_i32 s45, s45, s44 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_addk_i32 s45, 0x7fff -; GFX11-NEXT: s_bitset1_b32 s44, 22 -; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s74 -; GFX11-NEXT: v_lshl_or_b32 v14, v25, 16, v5 -; GFX11-NEXT: s_and_b32 s46, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s44, s44, s45 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: s_bfe_u32 s59, s58, 0x10010 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: s_add_i32 s59, s59, s58 +; GFX11-NEXT: s_bitset1_b32 s58, 22 +; GFX11-NEXT: s_addk_i32 s59, 0x7fff +; GFX11-NEXT: s_and_b32 s60, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s58, s58, s59 ; GFX11-NEXT: s_lshl_b32 s2, s2, 16 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 ; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s2 -; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v85, 24, v14 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s63 +; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v5 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s11 ; GFX11-NEXT: v_readfirstlane_b32 s2, v8 -; 
GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v3 -; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s11 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v25 +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX11-NEXT: s_bfe_u32 s11, s2, 0x10010 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_add_i32 s45, s11, s2 -; GFX11-NEXT: s_lshr_b32 s11, s44, 16 -; GFX11-NEXT: s_addk_i32 s45, 0x7fff +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: s_add_i32 s59, s11, s2 +; GFX11-NEXT: s_lshr_b32 s11, s58, 16 +; GFX11-NEXT: s_addk_i32 s59, 0x7fff ; GFX11-NEXT: s_bitset1_b32 s2, 22 -; GFX11-NEXT: s_and_b32 s44, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s2, s2, s45 -; GFX11-NEXT: s_and_b32 s44, s17, 0xffff0000 -; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v26 -; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s44 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: v_bfe_u32 v10, v3, 16, 1 +; GFX11-NEXT: s_and_b32 s58, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s2, s2, s59 +; GFX11-NEXT: s_and_b32 s58, s17, 0xffff0000 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v27 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v6, v3 +; GFX11-NEXT: v_lshl_or_b32 v9, v26, 16, v7 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s58 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_lshl_or_b32 v8, v4, 16, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v6 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-NEXT: v_readfirstlane_b32 s58, v7 +; GFX11-NEXT: v_bfe_u32 v11, v5, 16, 1 ; GFX11-NEXT: s_lshr_b32 s2, s2, 16 -; GFX11-NEXT: v_lshl_or_b32 v13, v2, 16, v9 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 -; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX11-NEXT: v_readfirstlane_b32 s44, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v3 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v86, 16, v13 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, 
v4, vcc_lo -; GFX11-NEXT: s_bfe_u32 s45, s44, 0x10010 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_add_i32 s45, s45, s44 -; GFX11-NEXT: s_bitset1_b32 s44, 22 -; GFX11-NEXT: s_addk_i32 s45, 0x7fff -; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 -; GFX11-NEXT: s_and_b32 s46, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s44, s44, s45 +; GFX11-NEXT: s_pack_ll_b32_b16 s4, s0, s4 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc_lo +; GFX11-NEXT: s_bfe_u32 s59, s58, 0x10010 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: s_add_i32 s59, s59, s58 +; GFX11-NEXT: s_bitset1_b32 s58, 22 +; GFX11-NEXT: s_addk_i32 s59, 0x7fff +; GFX11-NEXT: v_add_nc_u32_e32 v11, v11, v5 +; GFX11-NEXT: s_and_b32 s60, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s58, s58, s59 ; GFX11-NEXT: s_lshl_b32 s17, s17, 16 -; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s73 -; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s17 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s72 -; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX11-NEXT: v_readfirstlane_b32 s17, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: s_lshr_b32 s72, s44, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v2 -; GFX11-NEXT: s_bfe_u32 s45, s17, 0x10010 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s17 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v11 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s57 ; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v27 -; GFX11-NEXT: s_add_i32 s45, s45, s17 +; GFX11-NEXT: v_readfirstlane_b32 s17, v6 +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s56 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX11-NEXT: s_bfe_u32 s56, s17, 
0x10010 +; GFX11-NEXT: v_bfe_u32 v10, v3, 16, 1 +; GFX11-NEXT: s_add_i32 s56, s56, s17 +; GFX11-NEXT: s_lshr_b32 s58, s58, 16 +; GFX11-NEXT: s_addk_i32 s56, 0x7fff ; GFX11-NEXT: s_bitset1_b32 s17, 22 -; GFX11-NEXT: s_addk_i32 s45, 0x7fff -; GFX11-NEXT: s_and_b32 s44, vcc_lo, exec_lo -; GFX11-NEXT: v_lshl_or_b32 v16, v28, 16, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 -; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX11-NEXT: s_cselect_b32 s17, s17, s45 -; GFX11-NEXT: s_and_b32 s44, s16, 0xffff0000 +; GFX11-NEXT: s_and_b32 s57, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s17, s17, s56 +; GFX11-NEXT: s_and_b32 s56, s16, 0xffff0000 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v10, v3 +; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s56 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_lshl_or_b32 v13, v29, 16, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v7 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX11-NEXT: v_readfirstlane_b32 s56, v10 ; GFX11-NEXT: s_lshr_b32 s17, s17, 16 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo -; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s63 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v83, 24, v16 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v2 -; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_nc_u32_e32 v2, v5, v3 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v29 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-NEXT: v_add_nc_u32_e32 v4, v8, v1 -; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s44 -; 
GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_readfirstlane_b32 s44, v8 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-NEXT: s_bfe_u32 s45, s44, 0x10010 -; GFX11-NEXT: s_add_i32 s45, s45, s44 -; GFX11-NEXT: s_bitset1_b32 s44, 22 -; GFX11-NEXT: s_addk_i32 s45, 0x7fff -; GFX11-NEXT: s_and_b32 s46, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s44, s44, s45 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s47 +; GFX11-NEXT: s_bfe_u32 s47, s56, 0x10010 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: s_add_i32 s47, s47, s56 +; GFX11-NEXT: s_bitset1_b32 s56, 22 +; GFX11-NEXT: s_addk_i32 s47, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: s_and_b32 s57, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s47, s56, s47 ; GFX11-NEXT: s_lshl_b32 s16, s16, 16 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s16 -; GFX11-NEXT: s_lshr_b32 s46, s44, 16 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s16, v8 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s62 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: s_bfe_u32 s45, s16, 0x10010 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_add_i32 s45, s45, s16 -; GFX11-NEXT: s_bitset1_b32 s16, 22 -; GFX11-NEXT: s_addk_i32 s45, 0x7fff -; GFX11-NEXT: s_and_b32 s44, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s16, s16, s45 -; GFX11-NEXT: s_and_b32 s44, s19, 0xffff0000 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s44 -; GFX11-NEXT: v_lshl_or_b32 v15, v1, 16, v5 -; GFX11-NEXT: v_bfe_u32 
v9, v4, 16, 1 -; GFX11-NEXT: s_lshr_b32 s16, s16, 16 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v8, vcc_lo -; GFX11-NEXT: v_readfirstlane_b32 s44, v10 +; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s16 +; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v7, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v83, 24, v13 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v11, vcc_lo +; GFX11-NEXT: v_readfirstlane_b32 s16, v10 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s45 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-NEXT: v_add_nc_u32_e32 v9, v9, v4 -; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s60 -; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v4 -; GFX11-NEXT: s_bfe_u32 s45, s44, 0x10010 -; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s61 -; GFX11-NEXT: s_add_i32 s45, s45, s44 -; GFX11-NEXT: s_bitset1_b32 s44, 22 -; GFX11-NEXT: s_addk_i32 s45, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: s_bfe_u32 s45, s16, 0x10010 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: s_add_i32 s56, s45, s16 +; GFX11-NEXT: s_lshr_b32 s45, s47, 16 +; GFX11-NEXT: s_addk_i32 s56, 0x7fff +; GFX11-NEXT: s_bitset1_b32 s16, 22 ; GFX11-NEXT: s_and_b32 s47, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s44, s44, s45 +; GFX11-NEXT: s_cselect_b32 s16, s16, s56 +; GFX11-NEXT: s_and_b32 s47, s19, 0xffff0000 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_add_f32_e64 v14, 0x40c00000, s47 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_lshl_or_b32 v12, v3, 16, v7 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s44 +; GFX11-NEXT: v_bfe_u32 v11, v6, 16, 1 +; GFX11-NEXT: v_readfirstlane_b32 s47, v14 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v4, v10, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-NEXT: s_lshr_b32 s16, s16, 16 +; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s46 +; GFX11-NEXT: s_bfe_u32 s44, s47, 0x10010 +; GFX11-NEXT: v_add_nc_u32_e32 v11, v11, v6 +; GFX11-NEXT: 
s_add_i32 s44, s44, s47 +; GFX11-NEXT: s_bitset1_b32 s47, 22 +; GFX11-NEXT: s_addk_i32 s44, 0x7fff +; GFX11-NEXT: s_and_b32 s46, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s44, s47, s44 ; GFX11-NEXT: s_lshl_b32 s19, s19, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v9 -; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s19 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_bfe_u32 v9, v8, 16, 1 -; GFX11-NEXT: s_lshr_b32 s60, s44, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v1 -; GFX11-NEXT: v_readfirstlane_b32 s19, v10 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-NEXT: v_bfe_u32 v3, v5, 16, 1 -; GFX11-NEXT: v_add_nc_u32_e32 v4, v9, v8 -; GFX11-NEXT: s_bfe_u32 s45, s19, 0x10010 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v2 -; GFX11-NEXT: s_add_i32 s45, s45, s19 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v11 +; GFX11-NEXT: v_add_f32_e64 v14, 0x40c00000, s19 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_bfe_u32 v11, v10, 16, 1 +; GFX11-NEXT: s_lshr_b32 s59, s44, 16 +; GFX11-NEXT: v_readfirstlane_b32 s19, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-NEXT: v_bfe_u32 v5, v7, 16, 1 +; GFX11-NEXT: s_bfe_u32 s46, s19, 0x10010 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v11, v10 +; GFX11-NEXT: s_add_i32 s46, s46, s19 ; GFX11-NEXT: s_bitset1_b32 s19, 22 -; GFX11-NEXT: s_addk_i32 s45, 0x7fff +; GFX11-NEXT: s_addk_i32 s46, 0x7fff ; GFX11-NEXT: s_and_b32 s44, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s19, s19, s45 +; GFX11-NEXT: s_cselect_b32 s19, s19, s46 ; GFX11-NEXT: s_and_b32 s44, s18, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v1, v3, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 -; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v8 -; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s44 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; 
GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v5, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v6 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v10 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s44 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v7 ; GFX11-NEXT: s_lshr_b32 s19, s19, 16 -; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s29 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s41 -; GFX11-NEXT: v_readfirstlane_b32 s41, v4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_pack_ll_b32_b16 s47, s17, s72 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v2 -; GFX11-NEXT: v_bfe_u32 v2, v3, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v14, 0x40c00000, s29 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s41 +; GFX11-NEXT: v_readfirstlane_b32 s41, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 ; GFX11-NEXT: s_bfe_u32 s44, s41, 0x10010 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX11-NEXT: s_add_i32 s44, s44, s41 ; GFX11-NEXT: s_bitset1_b32 s41, 22 ; GFX11-NEXT: s_addk_i32 s44, 0x7fff -; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s40 -; GFX11-NEXT: s_and_b32 s45, vcc_lo, exec_lo +; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s40 +; GFX11-NEXT: s_and_b32 s46, vcc_lo, exec_lo ; GFX11-NEXT: s_cselect_b32 s41, s41, s44 ; GFX11-NEXT: s_lshl_b32 s18, s18, 16 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v31 -; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s18 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v32 -; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v3 -; GFX11-NEXT: v_lshl_or_b32 v18, v30, 16, v4 -; GFX11-NEXT: v_readfirstlane_b32 s18, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_lshl_or_b32 v17, v1, 16, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v2 -; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v32 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s18 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v33 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_lshl_or_b32 v19, v31, 16, v6 +; GFX11-NEXT: v_readfirstlane_b32 s18, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_lshl_or_b32 v18, v3, 16, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v5 ; GFX11-NEXT: s_bfe_u32 s40, s18, 0x10010 -; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s28 +; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s28 ; GFX11-NEXT: s_add_i32 s44, s40, s18 ; GFX11-NEXT: s_lshr_b32 s40, s41, 16 ; GFX11-NEXT: s_addk_i32 s44, 0x7fff @@ -168374,105 +168487,105 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: s_and_b32 s41, vcc_lo, exec_lo ; GFX11-NEXT: s_cselect_b32 s18, s18, s44 ; GFX11-NEXT: s_and_b32 s41, s21, 0xffff0000 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s41 -; GFX11-NEXT: v_bfe_u32 v2, v9, 16, 1 -; GFX11-NEXT: s_lshr_b32 s18, s18, 16 -; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v9 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo -; GFX11-NEXT: v_readfirstlane_b32 s28, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v9 -; GFX11-NEXT: v_bfe_u32 v4, v8, 16, 1 -; GFX11-NEXT: v_bfe_u32 v5, v10, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s41 +; GFX11-NEXT: v_bfe_u32 v4, v11, 16, 1 +; GFX11-NEXT: s_lshr_b32 s18, s18, 16 +; GFX11-NEXT: v_or_b32_e32 v5, 
0x400000, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo +; GFX11-NEXT: v_readfirstlane_b32 s28, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v11 +; GFX11-NEXT: v_bfe_u32 v6, v10, 16, 1 +; GFX11-NEXT: v_bfe_u32 v7, v14, 16, 1 ; GFX11-NEXT: s_bfe_u32 s29, s28, 0x10010 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v3 ; GFX11-NEXT: s_add_i32 s29, s29, s28 ; GFX11-NEXT: s_bitset1_b32 s28, 22 ; GFX11-NEXT: s_addk_i32 s29, 0x7fff ; GFX11-NEXT: s_and_b32 s41, vcc_lo, exec_lo ; GFX11-NEXT: s_cselect_b32 s28, s28, s29 ; GFX11-NEXT: s_lshl_b32 s21, s21, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s21 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-NEXT: s_lshr_b32 s61, s28, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v1, v5, v10 -; GFX11-NEXT: s_pack_ll_b32_b16 s44, s2, s11 -; GFX11-NEXT: v_readfirstlane_b32 s21, v11 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: v_add_f32_e64 v15, 0x40c00000, s21 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-NEXT: v_add_nc_u32_e32 v3, v4, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-NEXT: s_lshr_b32 s60, s28, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v7, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v81, 24, v19 +; GFX11-NEXT: v_readfirstlane_b32 s21, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v6, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 ; GFX11-NEXT: s_bfe_u32 s29, s21, 0x10010 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v4 ; GFX11-NEXT: s_add_i32 s29, s29, s21 ; GFX11-NEXT: s_bitset1_b32 s21, 22 ; GFX11-NEXT: s_addk_i32 s29, 0x7fff ; GFX11-NEXT: s_and_b32 s28, vcc_lo, exec_lo ; GFX11-NEXT: s_cselect_b32 s21, s21, s29 ; GFX11-NEXT: s_and_b32 s28, s20, 0xffff0000 -; 
GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v3 -; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v8 -; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s28 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v10 -; GFX11-NEXT: s_lshr_b32 s21, s21, 16 -; GFX11-NEXT: s_pack_ll_b32_b16 s45, s3, s59 -; GFX11-NEXT: s_pack_ll_b32_b16 s46, s16, s46 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v5 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v10 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s28 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s13 -; GFX11-NEXT: v_readfirstlane_b32 s13, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v81, 24, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v14 +; GFX11-NEXT: s_lshr_b32 s21, s21, 16 +; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v85, 24, v9 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s13 +; GFX11-NEXT: v_readfirstlane_b32 s13, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v86, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 ; GFX11-NEXT: s_bfe_u32 s28, s13, 0x10010 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v34 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v35 ; GFX11-NEXT: s_add_i32 s28, s28, s13 ; GFX11-NEXT: s_bitset1_b32 s13, 22 ; GFX11-NEXT: s_addk_i32 s28, 0x7fff ; GFX11-NEXT: s_and_b32 s29, vcc_lo, exec_lo ; GFX11-NEXT: s_cselect_b32 s13, s13, s28 ; GFX11-NEXT: s_lshl_b32 s20, s20, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v1 -; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s20 -; GFX11-NEXT: v_bfe_u32 v1, v3, 16, 1 -; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s10 -; 
GFX11-NEXT: v_lshl_or_b32 v20, v33, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v35 -; GFX11-NEXT: v_readfirstlane_b32 s20, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 -; GFX11-NEXT: v_lshl_or_b32 v19, v2, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s20 +; GFX11-NEXT: v_bfe_u32 v3, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s10 +; GFX11-NEXT: v_lshl_or_b32 v21, v34, 16, v6 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v36 +; GFX11-NEXT: v_readfirstlane_b32 s20, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: v_bfe_u32 v6, v7, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v20, v4, 16, v11 ; GFX11-NEXT: s_bfe_u32 s10, s20, 0x10010 -; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 ; GFX11-NEXT: s_add_i32 s28, s10, s20 ; GFX11-NEXT: s_lshr_b32 s10, s13, 16 ; GFX11-NEXT: s_addk_i32 s28, 0x7fff ; GFX11-NEXT: s_bitset1_b32 s20, 22 -; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v3 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v5 ; GFX11-NEXT: s_and_b32 s13, vcc_lo, exec_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-NEXT: s_cselect_b32 s13, s20, s28 ; GFX11-NEXT: s_and_b32 s20, s23, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 -; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s42 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo -; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s20 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4 -; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v5 -; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s43 -; GFX11-NEXT: v_readfirstlane_b32 s28, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v6, v7 +; GFX11-NEXT: v_add_f32_e64 v10, 
0x40c00000, s42 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s20 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v6 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s43 +; GFX11-NEXT: v_readfirstlane_b32 s28, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v20 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-NEXT: s_bfe_u32 s20, s28, 0x10010 -; GFX11-NEXT: v_bfe_u32 v4, v8, 16, 1 +; GFX11-NEXT: v_bfe_u32 v6, v10, 16, 1 ; GFX11-NEXT: s_add_i32 s29, s20, s28 ; GFX11-NEXT: s_lshr_b32 s20, s13, 16 ; GFX11-NEXT: s_addk_i32 s29, 0x7fff @@ -168480,235 +168593,241 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: s_and_b32 s13, vcc_lo, exec_lo ; GFX11-NEXT: s_cselect_b32 s13, s28, s29 ; GFX11-NEXT: s_lshl_b32 s23, s23, 16 -; GFX11-NEXT: v_bfe_u32 v5, v9, 16, 1 -; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s23 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v3, v4, v8 -; GFX11-NEXT: s_lshr_b32 s62, s13, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v1, v5, v9 -; GFX11-NEXT: v_readfirstlane_b32 s23, v2 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_bfe_u32 v7, v11, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s23 +; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v6, v10 +; GFX11-NEXT: s_lshr_b32 s63, s13, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v7, v11 +; GFX11-NEXT: v_readfirstlane_b32 s23, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v10 ; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 -; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 ; GFX11-NEXT: 
s_bfe_u32 s28, s23, 0x10010 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v9 +; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v11 ; GFX11-NEXT: s_add_i32 s28, s28, s23 ; GFX11-NEXT: s_bitset1_b32 s23, 22 ; GFX11-NEXT: s_addk_i32 s28, 0x7fff ; GFX11-NEXT: s_and_b32 s13, vcc_lo, exec_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 ; GFX11-NEXT: s_cselect_b32 s13, s23, s28 ; GFX11-NEXT: s_and_b32 s23, s22, 0xffff0000 -; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s15 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v36 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo -; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s23 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s14 +; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s15 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v37 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s23 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s14 ; GFX11-NEXT: s_lshr_b32 s23, s13, 16 -; GFX11-NEXT: v_bfe_u32 v9, v8, 16, 1 -; GFX11-NEXT: v_readfirstlane_b32 s14, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_lshl_or_b32 v71, v37, 16, v4 -; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s12 +; GFX11-NEXT: v_bfe_u32 v11, v10, 16, 1 +; GFX11-NEXT: v_readfirstlane_b32 s14, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v14, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_lshl_or_b32 v71, v38, 16, v6 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s12 ; GFX11-NEXT: s_bfe_u32 s15, s14, 0x10010 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v3 ; GFX11-NEXT: s_add_i32 s15, s15, s14 ; GFX11-NEXT: s_bitset1_b32 s14, 22 ; GFX11-NEXT: s_addk_i32 s15, 0x7fff ; GFX11-NEXT: s_and_b32 s13, vcc_lo, exec_lo ; GFX11-NEXT: s_cselect_b32 s13, s14, s15 ; GFX11-NEXT: s_lshl_b32 s14, s22, 
16 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s14 -; GFX11-NEXT: v_bfe_u32 v1, v5, 16, 1 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v38 -; GFX11-NEXT: v_add_nc_u32_e32 v9, v9, v8 -; GFX11-NEXT: s_lshr_b32 s13, s13, 16 -; GFX11-NEXT: v_readfirstlane_b32 s14, v10 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v5 -; GFX11-NEXT: v_lshl_or_b32 v70, v2, 16, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_add_f32_e64 v14, 0x40c00000, s14 +; GFX11-NEXT: v_bfe_u32 v3, v7, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v39 +; GFX11-NEXT: v_add_nc_u32_e32 v11, v11, v10 +; GFX11-NEXT: s_lshr_b32 s28, s13, 16 +; GFX11-NEXT: v_readfirstlane_b32 s14, v14 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v7 +; GFX11-NEXT: v_lshl_or_b32 v70, v4, 16, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v11 ; GFX11-NEXT: s_bfe_u32 s12, s14, 0x10010 -; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v8 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v10 ; GFX11-NEXT: s_add_i32 s12, s12, s14 ; GFX11-NEXT: s_bitset1_b32 s14, 22 ; GFX11-NEXT: s_addk_i32 s12, 0x7fff -; GFX11-NEXT: s_and_b32 s15, vcc_lo, exec_lo +; GFX11-NEXT: s_and_b32 s13, vcc_lo, exec_lo ; GFX11-NEXT: s_cselect_b32 s12, s14, s12 -; GFX11-NEXT: s_and_b32 s14, s25, 0xffff0000 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s14 -; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s9 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX11-NEXT: v_readfirstlane_b32 s9, v10 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_lshr_b32 s22, s12, 16 -; GFX11-NEXT: v_bfe_u32 v3, v4, 16, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v2 -; GFX11-NEXT: s_bfe_u32 s14, s9, 0x10010 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo 
+; GFX11-NEXT: s_and_b32 s13, s25, 0xffff0000 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-NEXT: s_add_i32 s14, s14, s9 +; GFX11-NEXT: v_add_f32_e64 v14, 0x40c00000, s13 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s9 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo +; GFX11-NEXT: v_readfirstlane_b32 s9, v14 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: s_lshr_b32 s22, s12, 16 +; GFX11-NEXT: v_bfe_u32 v5, v6, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; GFX11-NEXT: s_bfe_u32 s13, s9, 0x10010 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-NEXT: s_add_i32 s13, s13, s9 ; GFX11-NEXT: s_bitset1_b32 s9, 22 -; GFX11-NEXT: s_addk_i32 s14, 0x7fff -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-NEXT: s_addk_i32 s13, 0x7fff +; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v3 ; GFX11-NEXT: s_and_b32 s12, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s9, s9, s14 +; GFX11-NEXT: s_cselect_b32 s9, s9, s13 ; GFX11-NEXT: s_lshl_b32 s12, s25, 16 -; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s8 -; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s12 -; GFX11-NEXT: v_add_nc_u32_e32 v2, v3, v4 -; GFX11-NEXT: s_lshr_b32 s63, s9, 16 -; GFX11-NEXT: v_bfe_u32 v3, v8, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v4 -; GFX11-NEXT: v_readfirstlane_b32 s8, v1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v8 -; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s8 +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s12 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v5, v6 +; GFX11-NEXT: s_lshr_b32 s72, s9, 16 +; GFX11-NEXT: v_bfe_u32 v5, v10, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v6 +; GFX11-NEXT: v_readfirstlane_b32 s8, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: 
v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v10 +; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v10 ; GFX11-NEXT: s_bfe_u32 s12, s8, 0x10010 -; GFX11-NEXT: v_bfe_u32 v12, v9, 16, 1 +; GFX11-NEXT: v_bfe_u32 v16, v11, 16, 1 ; GFX11-NEXT: s_add_i32 s12, s12, s8 ; GFX11-NEXT: s_bitset1_b32 s8, 22 ; GFX11-NEXT: s_addk_i32 s12, 0x7fff ; GFX11-NEXT: s_and_b32 s9, vcc_lo, exec_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX11-NEXT: s_cselect_b32 s8, s8, s12 ; GFX11-NEXT: s_and_b32 s9, s24, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 ; GFX11-NEXT: s_lshr_b32 s25, s8, 16 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v10, vcc_lo -; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s9 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s7 -; GFX11-NEXT: v_add_nc_u32_e32 v4, v12, v9 -; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s6 -; GFX11-NEXT: v_readfirstlane_b32 s7, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v9 -; GFX11-NEXT: s_pack_ll_b32_b16 s28, s0, s5 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v4, v14, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s9 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v16, v11 +; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s6 +; GFX11-NEXT: v_readfirstlane_b32 s7, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v15, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX11-NEXT: s_bfe_u32 s9, s7, 0x10010 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v5 ; GFX11-NEXT: s_add_i32 s9, s9, s7 ; GFX11-NEXT: s_bitset1_b32 s7, 22 ; GFX11-NEXT: s_addk_i32 s9, 0x7fff ; GFX11-NEXT: 
s_and_b32 s8, vcc_lo, exec_lo ; GFX11-NEXT: s_cselect_b32 s7, s7, s9 ; GFX11-NEXT: s_lshl_b32 s8, s24, 16 -; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4 -; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s8 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s4 -; GFX11-NEXT: v_bfe_u32 v4, v8, 16, 1 -; GFX11-NEXT: s_lshr_b32 s12, s7, 16 -; GFX11-NEXT: v_readfirstlane_b32 s8, v10 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v8 -; GFX11-NEXT: v_bfe_u32 v10, v12, 16, 1 -; GFX11-NEXT: s_bfe_u32 s4, s8, 0x10010 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v2 -; GFX11-NEXT: s_add_i32 s4, s4, s8 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v6 +; GFX11-NEXT: v_add_f32_e64 v14, 0x40c00000, s8 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s5 +; GFX11-NEXT: v_bfe_u32 v6, v10, 16, 1 +; GFX11-NEXT: s_lshr_b32 s42, s7, 16 +; GFX11-NEXT: v_readfirstlane_b32 s8, v14 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v6, v10 +; GFX11-NEXT: v_bfe_u32 v14, v16, 16, 1 +; GFX11-NEXT: s_bfe_u32 s5, s8, 0x10010 +; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-NEXT: s_add_i32 s5, s5, s8 ; GFX11-NEXT: s_bitset1_b32 s8, 22 -; GFX11-NEXT: s_addk_i32 s4, 0x7fff +; GFX11-NEXT: s_addk_i32 s5, 0x7fff ; GFX11-NEXT: s_and_b32 s6, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s4, s8, s4 +; GFX11-NEXT: s_cselect_b32 s5, s8, s5 ; GFX11-NEXT: s_and_b32 s6, s27, 0xffff0000 -; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4 -; GFX11-NEXT: v_add_f32_e64 v52, 0x40c00000, s6 -; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v12 -; GFX11-NEXT: s_lshr_b32 s24, s4, 16 -; GFX11-NEXT: v_readfirstlane_b32 s6, v52 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v9 -; GFX11-NEXT: 
v_cndmask_b32_e32 v3, v3, v4, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 -; GFX11-NEXT: v_bfe_u32 v4, v9, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v6 +; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s6 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v10 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v10, v14, v16 +; GFX11-NEXT: s_lshr_b32 s24, s5, 16 +; GFX11-NEXT: v_readfirstlane_b32 s6, v17 +; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_bfe_u32 v6, v11, 16, 1 ; GFX11-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v50 ; GFX11-NEXT: s_add_i32 s7, s7, s6 ; GFX11-NEXT: s_bitset1_b32 s6, 22 ; GFX11-NEXT: s_addk_i32 s7, 0x7fff -; GFX11-NEXT: s_and_b32 s4, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s4, s6, s7 +; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s5, s6, s7 ; GFX11-NEXT: s_lshl_b32 s6, s27, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v2, v4, v9 -; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v8 -; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v12 -; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-NEXT: s_lshr_b32 s73, s4, 16 -; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v49 -; GFX11-NEXT: v_readfirstlane_b32 s6, v8 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GFX11-NEXT: v_lshl_or_b32 v66, v1, 16, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v10 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v16 +; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; 
GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: s_lshr_b32 s73, s5, 16 +; GFX11-NEXT: v_lshl_or_b32 v66, v3, 16, v15 +; GFX11-NEXT: v_readfirstlane_b32 s6, v10 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v52 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v48 ; GFX11-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v14, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 ; GFX11-NEXT: s_add_i32 s7, s7, s6 ; GFX11-NEXT: s_bitset1_b32 s6, 22 ; GFX11-NEXT: s_addk_i32 s7, 0x7fff -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_and_b32 s4, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s4, s6, s7 -; GFX11-NEXT: s_and_b32 s6, s26, 0xffff0000 -; GFX11-NEXT: s_lshr_b32 s27, s4, 16 -; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s6 -; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v52 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v39 -; GFX11-NEXT: v_lshl_or_b32 v55, v50, 16, v4 -; GFX11-NEXT: s_pack_ll_b32_b16 s8, s22, s13 -; GFX11-NEXT: v_readfirstlane_b32 s6, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s6, s6, s7 +; GFX11-NEXT: s_and_b32 s5, s26, 0xffff0000 +; GFX11-NEXT: s_lshr_b32 s27, s6, 16 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s5 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v53 +; GFX11-NEXT: s_pack_ll_b32_b16 s14, s22, s28 +; GFX11-NEXT: v_lshl_or_b32 v55, v51, 16, v6 +; GFX11-NEXT: v_lshl_or_b32 v67, v49, 16, v7 +; GFX11-NEXT: v_readfirstlane_b32 s8, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_lshl_or_b32 v54, v4, 16, v10 +; GFX11-NEXT: s_pack_ll_b32_b16 s12, s18, s40 +; GFX11-NEXT: v_lshrrev_b64 v[14:15], 24, v[12:13] +; GFX11-NEXT: s_bfe_u32 s9, s8, 0x10010 
+; GFX11-NEXT: v_lshrrev_b64 v[15:16], 24, v[8:9] +; GFX11-NEXT: s_add_i32 s9, s9, s8 +; GFX11-NEXT: s_bitset1_b32 s8, 22 +; GFX11-NEXT: s_addk_i32 s9, 0x7fff +; GFX11-NEXT: s_and_b32 s6, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s29, s8, s9 +; GFX11-NEXT: s_lshl_b32 s8, s26, 16 +; GFX11-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s8 +; GFX11-NEXT: v_lshrrev_b64 v[10:11], 24, v[18:19] +; GFX11-NEXT: v_lshrrev_b64 v[16:17], 24, v[1:2] +; GFX11-NEXT: s_pack_ll_b32_b16 s5, s1, s61 +; GFX11-NEXT: s_pack_ll_b32_b16 s7, s3, s62 +; GFX11-NEXT: v_readfirstlane_b32 s26, v3 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_lshl_or_b32 v54, v2, 16, v8 -; GFX11-NEXT: v_lshl_or_b32 v67, v48, 16, v5 -; GFX11-NEXT: v_lshrrev_b64 v[8:9], 24, v[17:18] -; GFX11-NEXT: s_bfe_u32 s5, s6, 0x10010 -; GFX11-NEXT: v_lshrrev_b64 v[9:10], 24, v[15:16] -; GFX11-NEXT: s_add_i32 s5, s5, s6 -; GFX11-NEXT: s_bitset1_b32 s6, 22 -; GFX11-NEXT: s_addk_i32 s5, 0x7fff -; GFX11-NEXT: s_and_b32 s4, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s14, s6, s5 -; GFX11-NEXT: s_lshl_b32 s4, s26, 16 -; GFX11-NEXT: s_pack_ll_b32_b16 s6, s20, s10 -; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 -; GFX11-NEXT: s_lshr_b32 s13, s14, 16 -; GFX11-NEXT: v_lshrrev_b64 v[10:11], 24, v[13:14] -; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[6:7] -; GFX11-NEXT: s_pack_ll_b32_b16 s29, s1, s58 -; GFX11-NEXT: v_readfirstlane_b32 s11, v1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: v_lshrrev_b64 v[1:2], 24, v[54:55] -; GFX11-NEXT: v_lshrrev_b64 v[2:3], 24, v[66:67] -; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[70:71] -; GFX11-NEXT: s_bfe_u32 s10, s11, 0x10010 -; GFX11-NEXT: v_lshrrev_b64 v[4:5], 24, v[19:20] -; GFX11-NEXT: s_add_i32 s10, s10, s11 -; GFX11-NEXT: s_bitset1_b32 s11, 22 -; GFX11-NEXT: s_addk_i32 s10, 0x7fff -; GFX11-NEXT: s_and_b32 s14, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s10, s11, s10 -; GFX11-NEXT: s_pack_ll_b32_b16 s5, s19, s60 -; GFX11-NEXT: 
s_lshr_b32 s26, s10, 16 -; GFX11-NEXT: s_pack_ll_b32_b16 s4, s18, s40 -; GFX11-NEXT: s_pack_ll_b32_b16 s9, s23, s62 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v55 -; GFX11-NEXT: v_lshrrev_b32_e32 v12, 8, v55 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v54 +; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[54:55] +; GFX11-NEXT: v_lshrrev_b64 v[4:5], 24, v[66:67] +; GFX11-NEXT: v_lshrrev_b64 v[5:6], 24, v[70:71] +; GFX11-NEXT: s_bfe_u32 s28, s26, 0x10010 +; GFX11-NEXT: v_lshrrev_b64 v[6:7], 24, v[20:21] +; GFX11-NEXT: s_add_i32 s28, s28, s26 +; GFX11-NEXT: s_bitset1_b32 s26, 22 +; GFX11-NEXT: s_addk_i32 s28, 0x7fff +; GFX11-NEXT: s_and_b32 s29, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s26, s26, s28 +; GFX11-NEXT: s_pack_ll_b32_b16 s6, s2, s11 +; GFX11-NEXT: s_lshr_b32 s26, s26, 16 +; GFX11-NEXT: s_pack_ll_b32_b16 s13, s19, s59 +; GFX11-NEXT: s_pack_ll_b32_b16 s11, s21, s60 +; GFX11-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX11-NEXT: s_pack_ll_b32_b16 s15, s23, s63 +; GFX11-NEXT: s_pack_ll_b32_b16 s29, s25, s72 +; GFX11-NEXT: s_pack_ll_b32_b16 s41, s27, s73 +; GFX11-NEXT: s_pack_ll_b32_b16 s40, s26, s40 +; GFX11-NEXT: s_pack_ll_b32_b16 s28, s24, s42 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v55 +; GFX11-NEXT: v_lshrrev_b32_e32 v11, 8, v55 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v54 ; GFX11-NEXT: v_lshrrev_b32_e32 v54, 8, v54 ; GFX11-NEXT: v_lshrrev_b32_e32 v55, 24, v67 ; GFX11-NEXT: v_lshrrev_b32_e32 v64, 8, v67 @@ -168718,120 +168837,114 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: v_lshrrev_b32_e32 v68, 8, v71 ; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v70 ; GFX11-NEXT: v_lshrrev_b32_e32 v70, 8, v70 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 24, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v71, 24, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v21, 8, v21 ; GFX11-NEXT: v_lshrrev_b32_e32 v20, 8, v20 ; GFX11-NEXT: v_lshrrev_b32_e32 v19, 8, v19 ; GFX11-NEXT: v_lshrrev_b32_e32 v18, 8, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v17 -; 
GFX11-NEXT: v_lshrrev_b32_e32 v17, 8, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v16, 8, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v15, 8, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v14, 8, v14 ; GFX11-NEXT: v_lshrrev_b32_e32 v13, 8, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 8, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 8, v6 -; GFX11-NEXT: s_pack_ll_b32_b16 s7, s21, s61 -; GFX11-NEXT: s_pack_ll_b32_b16 s11, s25, s63 -; GFX11-NEXT: s_pack_ll_b32_b16 s57, s27, s73 -; GFX11-NEXT: s_pack_ll_b32_b16 s56, s26, s13 -; GFX11-NEXT: s_pack_ll_b32_b16 s10, s24, s12 -; GFX11-NEXT: s_lshr_b64 s[94:95], s[8:9], 24 -; GFX11-NEXT: s_lshr_b64 s[12:13], s[4:5], 24 -; GFX11-NEXT: s_lshr_b64 s[14:15], s[46:47], 24 -; GFX11-NEXT: s_lshr_b64 s[40:41], s[44:45], 24 -; GFX11-NEXT: s_lshr_b64 s[42:43], s[28:29], 24 -; GFX11-NEXT: s_lshr_b64 vcc, s[56:57], 24 -; GFX11-NEXT: s_lshr_b64 s[34:35], s[10:11], 24 -; GFX11-NEXT: s_lshr_b64 s[30:31], s[6:7], 24 -; GFX11-NEXT: s_lshr_b32 s13, s57, 24 -; GFX11-NEXT: s_lshr_b32 s15, s57, 8 -; GFX11-NEXT: s_lshr_b32 s41, s56, 16 -; GFX11-NEXT: s_lshr_b32 s43, s56, 8 -; GFX11-NEXT: s_lshr_b32 s56, s11, 24 +; GFX11-NEXT: v_lshrrev_b32_e32 v12, 8, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 8, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v87, 24, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v96, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; GFX11-NEXT: s_pack_ll_b32_b16 s9, s17, s58 +; GFX11-NEXT: s_pack_ll_b32_b16 s8, s16, s45 +; GFX11-NEXT: s_lshr_b64 s[92:93], s[40:41], 24 +; GFX11-NEXT: s_lshr_b64 vcc, s[28:29], 24 +; GFX11-NEXT: s_lshr_b64 s[90:91], s[14:15], 24 +; GFX11-NEXT: s_lshr_b64 s[94:95], s[10:11], 24 +; GFX11-NEXT: s_lshr_b64 s[42:43], s[12:13], 24 +; GFX11-NEXT: s_lshr_b64 s[46:47], s[6:7], 24 +; GFX11-NEXT: s_lshr_b64 s[56:57], s[4:5], 24 +; GFX11-NEXT: s_lshr_b64 s[30:31], s[8:9], 24 +; GFX11-NEXT: s_lshr_b32 s43, s41, 24 +; GFX11-NEXT: 
s_lshr_b32 s41, s41, 8 +; GFX11-NEXT: s_lshr_b32 s45, s40, 16 +; GFX11-NEXT: s_lshr_b32 s40, s40, 8 +; GFX11-NEXT: s_lshr_b32 s47, s29, 24 +; GFX11-NEXT: s_lshr_b32 s29, s29, 8 +; GFX11-NEXT: s_lshr_b32 s57, s28, 16 +; GFX11-NEXT: s_lshr_b32 s28, s28, 8 +; GFX11-NEXT: s_lshr_b32 s74, s15, 24 +; GFX11-NEXT: s_lshr_b32 s15, s15, 8 +; GFX11-NEXT: s_lshr_b32 s75, s14, 16 +; GFX11-NEXT: s_lshr_b32 s14, s14, 8 +; GFX11-NEXT: s_lshr_b32 s76, s11, 24 ; GFX11-NEXT: s_lshr_b32 s11, s11, 8 -; GFX11-NEXT: s_lshr_b32 s57, s10, 16 +; GFX11-NEXT: s_lshr_b32 s77, s10, 16 ; GFX11-NEXT: s_lshr_b32 s10, s10, 8 -; GFX11-NEXT: s_lshr_b32 s74, s9, 24 +; GFX11-NEXT: s_lshr_b32 s78, s13, 24 +; GFX11-NEXT: s_lshr_b32 s13, s13, 8 +; GFX11-NEXT: s_lshr_b32 s79, s12, 16 +; GFX11-NEXT: s_lshr_b32 s12, s12, 8 +; GFX11-NEXT: s_lshr_b32 s88, s9, 24 ; GFX11-NEXT: s_lshr_b32 s9, s9, 8 -; GFX11-NEXT: s_lshr_b32 s75, s8, 16 +; GFX11-NEXT: s_lshr_b32 s89, s8, 16 ; GFX11-NEXT: s_lshr_b32 s8, s8, 8 -; GFX11-NEXT: s_lshr_b32 s76, s7, 24 -; GFX11-NEXT: s_lshr_b32 s77, s7, 8 -; GFX11-NEXT: s_lshr_b32 s78, s6, 16 -; GFX11-NEXT: s_lshr_b32 s79, s6, 8 -; GFX11-NEXT: s_lshr_b32 s88, s5, 24 -; GFX11-NEXT: s_lshr_b32 s89, s5, 8 -; GFX11-NEXT: s_lshr_b32 s90, s4, 16 -; GFX11-NEXT: s_lshr_b32 s91, s4, 8 -; GFX11-NEXT: s_lshr_b32 s92, s47, 24 -; GFX11-NEXT: s_lshr_b32 s47, s47, 8 -; GFX11-NEXT: s_lshr_b32 s93, s46, 16 -; GFX11-NEXT: s_lshr_b32 s46, s46, 8 -; GFX11-NEXT: s_lshr_b32 s95, s45, 24 -; GFX11-NEXT: s_lshr_b32 s45, s45, 8 -; GFX11-NEXT: s_lshr_b32 s99, s44, 16 -; GFX11-NEXT: s_lshr_b32 s100, s44, 8 -; GFX11-NEXT: s_lshr_b32 s101, s29, 24 -; GFX11-NEXT: s_lshr_b32 s102, s29, 8 -; GFX11-NEXT: s_lshr_b32 s103, s28, 16 -; GFX11-NEXT: s_lshr_b32 s104, s28, 8 +; GFX11-NEXT: s_lshr_b32 s91, s7, 24 +; GFX11-NEXT: s_lshr_b32 s95, s7, 8 +; GFX11-NEXT: s_lshr_b32 s93, s6, 16 +; GFX11-NEXT: s_lshr_b32 vcc_hi, s6, 8 +; GFX11-NEXT: s_lshr_b32 s7, s5, 24 +; GFX11-NEXT: s_lshr_b32 s99, s5, 8 +; GFX11-NEXT: s_lshr_b32 s44, 
s4, 16 +; GFX11-NEXT: s_lshr_b32 s104, s4, 8 ; GFX11-NEXT: s_branch .LBB91_5 ; GFX11-NEXT: .LBB91_3: -; GFX11-NEXT: ; implicit-def: $sgpr4 -; GFX11-NEXT: ; kill: killed $sgpr4 -; GFX11-NEXT: ; implicit-def: $sgpr74 -; GFX11-NEXT: ; implicit-def: $sgpr4 -; GFX11-NEXT: ; kill: killed $sgpr4 +; GFX11-NEXT: ; implicit-def: $sgpr10 +; GFX11-NEXT: ; kill: killed $sgpr10 +; GFX11-NEXT: ; implicit-def: $sgpr10 +; GFX11-NEXT: ; kill: killed $sgpr10 ; GFX11-NEXT: ; implicit-def: $sgpr104 -; GFX11-NEXT: ; implicit-def: $sgpr103 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr102 -; GFX11-NEXT: ; implicit-def: $sgpr11 -; GFX11-NEXT: ; implicit-def: $sgpr101 -; GFX11-NEXT: ; implicit-def: $sgpr100 +; GFX11-NEXT: ; implicit-def: $sgpr44 +; GFX11-NEXT: ; implicit-def: $sgpr56 ; GFX11-NEXT: ; implicit-def: $sgpr99 -; GFX11-NEXT: ; implicit-def: $sgpr40 -; GFX11-NEXT: ; implicit-def: $sgpr14 -; GFX11-NEXT: ; implicit-def: $sgpr12 -; GFX11-NEXT: ; implicit-def: $sgpr49 +; GFX11-NEXT: ; implicit-def: $sgpr41 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr37 +; GFX11-NEXT: ; implicit-def: $sgpr49 ; GFX11-NEXT: ; implicit-def: $sgpr35 -; GFX11-NEXT: ; implicit-def: $sgpr6 +; GFX11-NEXT: ; implicit-def: $sgpr12 ; GFX11-NEXT: ; implicit-def: $sgpr34 +; GFX11-NEXT: ; implicit-def: $sgpr51 ; GFX11-NEXT: ; implicit-def: $sgpr52 -; GFX11-NEXT: ; implicit-def: $sgpr53 ; GFX11-NEXT: ; implicit-def: $sgpr50 -; GFX11-NEXT: ; implicit-def: $sgpr7 +; GFX11-NEXT: ; implicit-def: $sgpr13 ; GFX11-NEXT: ; implicit-def: $sgpr36 +; GFX11-NEXT: ; implicit-def: $sgpr55 ; GFX11-NEXT: ; implicit-def: $sgpr64 -; GFX11-NEXT: ; implicit-def: $sgpr38 -; GFX11-NEXT: ; implicit-def: $sgpr54 +; GFX11-NEXT: ; implicit-def: $sgpr53 ; GFX11-NEXT: ; implicit-def: $sgpr96 -; GFX11-NEXT: ; implicit-def: $sgpr51 +; GFX11-NEXT: ; implicit-def: $sgpr38 +; GFX11-NEXT: ; implicit-def: $sgpr66 ; GFX11-NEXT: ; implicit-def: $sgpr67 -; 
GFX11-NEXT: ; implicit-def: $sgpr68 -; GFX11-NEXT: ; implicit-def: $sgpr65 -; GFX11-NEXT: ; implicit-def: $sgpr8 -; GFX11-NEXT: ; implicit-def: $sgpr55 ; GFX11-NEXT: ; implicit-def: $sgpr39 +; GFX11-NEXT: ; implicit-def: $sgpr14 +; GFX11-NEXT: ; implicit-def: $sgpr54 +; GFX11-NEXT: ; implicit-def: $sgpr70 ; GFX11-NEXT: ; implicit-def: $sgpr71 -; GFX11-NEXT: ; implicit-def: $sgpr69 -; GFX11-NEXT: ; implicit-def: $sgpr9 -; GFX11-NEXT: ; implicit-def: $sgpr66 +; GFX11-NEXT: ; implicit-def: $sgpr68 +; GFX11-NEXT: ; implicit-def: $sgpr15 +; GFX11-NEXT: ; implicit-def: $sgpr65 +; GFX11-NEXT: ; implicit-def: $sgpr81 ; GFX11-NEXT: ; implicit-def: $sgpr82 -; GFX11-NEXT: ; implicit-def: $sgpr83 ; GFX11-NEXT: ; implicit-def: $sgpr80 ; GFX11-NEXT: ; implicit-def: $sgpr97 -; GFX11-NEXT: ; implicit-def: $sgpr70 -; GFX11-NEXT: ; implicit-def: $sgpr48 -; GFX11-NEXT: ; implicit-def: $sgpr84 +; GFX11-NEXT: ; implicit-def: $sgpr69 +; GFX11-NEXT: ; implicit-def: $sgpr85 +; GFX11-NEXT: ; implicit-def: $sgpr83 ; GFX11-NEXT: ; implicit-def: $sgpr98 -; GFX11-NEXT: ; implicit-def: $sgpr81 +; GFX11-NEXT: ; implicit-def: $sgpr48 ; GFX11-NEXT: ; implicit-def: $sgpr86 ; GFX11-NEXT: ; implicit-def: $sgpr87 -; GFX11-NEXT: ; implicit-def: $sgpr10 -; GFX11-NEXT: ; implicit-def: $sgpr85 +; GFX11-NEXT: ; implicit-def: $sgpr40 +; GFX11-NEXT: ; implicit-def: $sgpr84 +; GFX11-NEXT: ; implicit-def: $sgpr102 +; GFX11-NEXT: ; implicit-def: $sgpr100 ; GFX11-NEXT: ; implicit-def: $sgpr30 ; GFX11-NEXT: ; implicit-def: $sgpr94 ; GFX11-NEXT: ; implicit-def: $sgpr92 @@ -168839,169 +168952,176 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: ; implicit-def: $sgpr88 ; GFX11-NEXT: ; implicit-def: $sgpr78 ; GFX11-NEXT: ; implicit-def: $sgpr76 -; GFX11-NEXT: ; implicit-def: $sgpr4 -; GFX11-NEXT: ; kill: killed $sgpr4 -; GFX11-NEXT: ; implicit-def: $sgpr4 -; GFX11-NEXT: ; kill: killed $sgpr4 -; GFX11-NEXT: ; implicit-def: $sgpr4 -; GFX11-NEXT: ; kill: killed $sgpr4 -; 
GFX11-NEXT: ; implicit-def: $sgpr4 -; GFX11-NEXT: ; kill: killed $sgpr4 -; GFX11-NEXT: ; implicit-def: $sgpr4 -; GFX11-NEXT: ; kill: killed $sgpr4 -; GFX11-NEXT: ; implicit-def: $sgpr4 -; GFX11-NEXT: ; kill: killed $sgpr4 -; GFX11-NEXT: ; implicit-def: $sgpr4 -; GFX11-NEXT: ; kill: killed $sgpr4 -; GFX11-NEXT: ; implicit-def: $sgpr4 -; GFX11-NEXT: ; kill: killed $sgpr4 -; GFX11-NEXT: ; implicit-def: $sgpr4 -; GFX11-NEXT: ; kill: killed $sgpr4 -; GFX11-NEXT: ; implicit-def: $sgpr4 -; GFX11-NEXT: ; kill: killed $sgpr4 -; GFX11-NEXT: ; implicit-def: $sgpr4 -; GFX11-NEXT: ; kill: killed $sgpr4 -; GFX11-NEXT: ; implicit-def: $sgpr4 -; GFX11-NEXT: ; kill: killed $sgpr4 -; GFX11-NEXT: ; implicit-def: $sgpr4 -; GFX11-NEXT: ; kill: killed $sgpr4 -; GFX11-NEXT: ; implicit-def: $sgpr4 -; GFX11-NEXT: v_writelane_b32 v43, s4, 0 -; GFX11-NEXT: v_writelane_b32 v43, s5, 1 -; GFX11-NEXT: ; implicit-def: $sgpr4 -; GFX11-NEXT: ; kill: killed $sgpr4 -; GFX11-NEXT: ; implicit-def: $sgpr4 -; GFX11-NEXT: ; kill: killed $sgpr4 -; GFX11-NEXT: ; implicit-def: $sgpr4 -; GFX11-NEXT: ; kill: killed $sgpr4 -; GFX11-NEXT: ; implicit-def: $sgpr4 -; GFX11-NEXT: ; kill: killed $sgpr4 -; GFX11-NEXT: ; implicit-def: $sgpr4 -; GFX11-NEXT: ; kill: killed $sgpr4 -; GFX11-NEXT: ; implicit-def: $sgpr4 -; GFX11-NEXT: v_writelane_b32 v43, s4, 2 -; GFX11-NEXT: v_writelane_b32 v43, s5, 3 -; GFX11-NEXT: ; implicit-def: $sgpr4 -; GFX11-NEXT: ; kill: killed $sgpr4 -; GFX11-NEXT: ; implicit-def: $sgpr4 -; GFX11-NEXT: ; kill: killed $sgpr4 -; GFX11-NEXT: ; implicit-def: $sgpr4 -; GFX11-NEXT: ; kill: killed $sgpr4 -; GFX11-NEXT: ; implicit-def: $sgpr4 -; GFX11-NEXT: ; kill: killed $sgpr4 -; GFX11-NEXT: ; implicit-def: $sgpr4 -; GFX11-NEXT: v_writelane_b32 v43, s74, 4 -; GFX11-NEXT: ; kill: killed $sgpr4 -; GFX11-NEXT: ; implicit-def: $sgpr4 -; GFX11-NEXT: ; kill: killed $sgpr4 -; GFX11-NEXT: ; implicit-def: $sgpr4 -; GFX11-NEXT: ; kill: killed $sgpr4 -; GFX11-NEXT: ; implicit-def: $sgpr4 -; GFX11-NEXT: ; kill: 
killed $sgpr4 -; GFX11-NEXT: ; implicit-def: $sgpr4 -; GFX11-NEXT: ; kill: killed $sgpr4 -; GFX11-NEXT: ; implicit-def: $sgpr4 -; GFX11-NEXT: v_writelane_b32 v43, s75, 5 -; GFX11-NEXT: ; implicit-def: $sgpr74 -; GFX11-NEXT: ; kill: killed $sgpr4 -; GFX11-NEXT: ; implicit-def: $sgpr4 -; GFX11-NEXT: ; kill: killed $sgpr4 -; GFX11-NEXT: ; implicit-def: $sgpr4 -; GFX11-NEXT: ; kill: killed $sgpr4 -; GFX11-NEXT: ; implicit-def: $sgpr4 -; GFX11-NEXT: ; kill: killed $sgpr4 -; GFX11-NEXT: ; implicit-def: $sgpr4 -; GFX11-NEXT: ; implicit-def: $sgpr5 -; GFX11-NEXT: v_writelane_b32 v43, s74, 6 -; GFX11-NEXT: v_writelane_b32 v43, s75, 7 ; GFX11-NEXT: ; implicit-def: $sgpr74 +; GFX11-NEXT: ; implicit-def: $sgpr10 +; GFX11-NEXT: ; kill: killed $sgpr10 +; GFX11-NEXT: ; implicit-def: $sgpr10 +; GFX11-NEXT: ; kill: killed $sgpr10 +; GFX11-NEXT: ; implicit-def: $sgpr10 +; GFX11-NEXT: ; kill: killed $sgpr10 +; GFX11-NEXT: ; implicit-def: $sgpr10 +; GFX11-NEXT: ; kill: killed $sgpr10 +; GFX11-NEXT: ; implicit-def: $sgpr10 +; GFX11-NEXT: ; kill: killed $sgpr10 +; GFX11-NEXT: ; implicit-def: $sgpr10 +; GFX11-NEXT: ; kill: killed $sgpr10 +; GFX11-NEXT: ; implicit-def: $sgpr10 +; GFX11-NEXT: v_writelane_b32 v42, s10, 0 +; GFX11-NEXT: v_writelane_b32 v42, s11, 1 +; GFX11-NEXT: ; implicit-def: $sgpr10 +; GFX11-NEXT: ; kill: killed $sgpr10 +; GFX11-NEXT: ; implicit-def: $sgpr10 +; GFX11-NEXT: ; kill: killed $sgpr10 +; GFX11-NEXT: ; implicit-def: $sgpr10 +; GFX11-NEXT: ; kill: killed $sgpr10 +; GFX11-NEXT: ; implicit-def: $sgpr10 +; GFX11-NEXT: ; kill: killed $sgpr10 +; GFX11-NEXT: ; implicit-def: $sgpr10 +; GFX11-NEXT: ; kill: killed $sgpr10 +; GFX11-NEXT: ; implicit-def: $sgpr10 +; GFX11-NEXT: ; kill: killed $sgpr10 +; GFX11-NEXT: ; implicit-def: $sgpr10 +; GFX11-NEXT: ; kill: killed $sgpr10 +; GFX11-NEXT: ; implicit-def: $sgpr10 +; GFX11-NEXT: ; kill: killed $sgpr10 +; GFX11-NEXT: ; implicit-def: $sgpr10 +; GFX11-NEXT: ; kill: killed $sgpr10 +; GFX11-NEXT: ; implicit-def: $sgpr10 +; 
GFX11-NEXT: ; kill: killed $sgpr10 +; GFX11-NEXT: ; implicit-def: $sgpr10 +; GFX11-NEXT: v_writelane_b32 v42, s10, 2 +; GFX11-NEXT: v_writelane_b32 v42, s11, 3 +; GFX11-NEXT: ; implicit-def: $sgpr10 +; GFX11-NEXT: ; kill: killed $sgpr10 +; GFX11-NEXT: ; implicit-def: $sgpr10 +; GFX11-NEXT: ; kill: killed $sgpr10 +; GFX11-NEXT: ; implicit-def: $sgpr10 +; GFX11-NEXT: ; kill: killed $sgpr10 +; GFX11-NEXT: ; implicit-def: $sgpr10 +; GFX11-NEXT: ; kill: killed $sgpr10 +; GFX11-NEXT: ; implicit-def: $sgpr10 +; GFX11-NEXT: ; kill: killed $sgpr10 +; GFX11-NEXT: ; implicit-def: $sgpr10 +; GFX11-NEXT: v_writelane_b32 v42, s10, 4 +; GFX11-NEXT: v_writelane_b32 v42, s11, 5 +; GFX11-NEXT: ; implicit-def: $sgpr10 +; GFX11-NEXT: ; kill: killed $sgpr10 +; GFX11-NEXT: ; implicit-def: $sgpr10 +; GFX11-NEXT: ; kill: killed $sgpr10 +; GFX11-NEXT: ; implicit-def: $sgpr10 +; GFX11-NEXT: ; kill: killed $sgpr10 +; GFX11-NEXT: ; implicit-def: $sgpr10 +; GFX11-NEXT: ; kill: killed $sgpr10 +; GFX11-NEXT: ; implicit-def: $sgpr10 +; GFX11-NEXT: ; kill: killed $sgpr10 +; GFX11-NEXT: ; implicit-def: $sgpr10 +; GFX11-NEXT: ; kill: killed $sgpr10 +; GFX11-NEXT: ; implicit-def: $sgpr10 +; GFX11-NEXT: ; kill: killed $sgpr10 +; GFX11-NEXT: ; implicit-def: $sgpr10 +; GFX11-NEXT: ; kill: killed $sgpr10 +; GFX11-NEXT: ; implicit-def: $sgpr10 +; GFX11-NEXT: ; kill: killed $sgpr10 +; GFX11-NEXT: ; implicit-def: $sgpr10 +; GFX11-NEXT: ; kill: killed $sgpr10 +; GFX11-NEXT: ; implicit-def: $sgpr10 +; GFX11-NEXT: ; kill: killed $sgpr10 +; GFX11-NEXT: ; implicit-def: $sgpr10 +; GFX11-NEXT: ; kill: killed $sgpr10 +; GFX11-NEXT: ; implicit-def: $sgpr10 +; GFX11-NEXT: ; kill: killed $sgpr10 +; GFX11-NEXT: ; implicit-def: $sgpr10 +; GFX11-NEXT: ; implicit-def: $sgpr11 ; GFX11-NEXT: s_branch .LBB91_2 ; GFX11-NEXT: .LBB91_4: -; GFX11-NEXT: v_dual_mov_b32 v10, s94 :: v_dual_mov_b32 v11, s30 -; GFX11-NEXT: v_readlane_b32 s94, v43, 2 -; GFX11-NEXT: v_dual_mov_b32 v96, s37 :: v_dual_mov_b32 v87, s34 -; GFX11-NEXT: 
v_dual_mov_b32 v6, s49 :: v_dual_mov_b32 v7, s35 -; GFX11-NEXT: v_readlane_b32 s95, v43, 3 -; GFX11-NEXT: v_readlane_b32 vcc_lo, v43, 6 -; GFX11-NEXT: v_readlane_b32 s30, v43, 0 -; GFX11-NEXT: v_readlane_b32 s34, v43, 4 -; GFX11-NEXT: v_dual_mov_b32 v52, s44 :: v_dual_mov_b32 v51, s45 -; GFX11-NEXT: v_dual_mov_b32 v50, s10 :: v_dual_mov_b32 v49, s46 -; GFX11-NEXT: v_dual_mov_b32 v39, s47 :: v_dual_mov_b32 v48, s98 -; GFX11-NEXT: v_dual_mov_b32 v38, s56 :: v_dual_mov_b32 v37, s97 -; GFX11-NEXT: v_dual_mov_b32 v36, s57 :: v_dual_mov_b32 v35, s58 -; GFX11-NEXT: v_dual_mov_b32 v34, s59 :: v_dual_mov_b32 v33, s9 -; GFX11-NEXT: v_dual_mov_b32 v32, s60 :: v_dual_mov_b32 v31, s61 -; GFX11-NEXT: v_dual_mov_b32 v30, s8 :: v_dual_mov_b32 v29, s62 -; GFX11-NEXT: v_dual_mov_b32 v27, s63 :: v_dual_mov_b32 v28, s96 -; GFX11-NEXT: v_dual_mov_b32 v26, s72 :: v_dual_mov_b32 v25, s7 -; GFX11-NEXT: v_dual_mov_b32 v24, s73 :: v_dual_mov_b32 v23, s28 -; GFX11-NEXT: v_dual_mov_b32 v21, s29 :: v_dual_mov_b32 v22, s6 -; GFX11-NEXT: v_dual_mov_b32 v53, s87 :: v_dual_mov_b32 v54, s86 -; GFX11-NEXT: v_dual_mov_b32 v5, s85 :: v_dual_mov_b32 v12, s5 -; GFX11-NEXT: v_dual_mov_b32 v65, s4 :: v_dual_mov_b32 v66, s48 -; GFX11-NEXT: v_dual_mov_b32 v55, s81 :: v_dual_mov_b32 v64, s84 -; GFX11-NEXT: v_dual_mov_b32 v69, s83 :: v_dual_mov_b32 v70, s82 -; GFX11-NEXT: v_dual_mov_b32 v67, s70 :: v_dual_mov_b32 v68, s80 -; GFX11-NEXT: v_dual_mov_b32 v80, s71 :: v_dual_mov_b32 v19, s39 -; GFX11-NEXT: v_dual_mov_b32 v71, s66 :: v_dual_mov_b32 v20, s69 -; GFX11-NEXT: v_dual_mov_b32 v82, s68 :: v_dual_mov_b32 v17, s67 -; GFX11-NEXT: v_dual_mov_b32 v81, s55 :: v_dual_mov_b32 v18, s65 -; GFX11-NEXT: v_dual_mov_b32 v84, s38 :: v_dual_mov_b32 v15, s64 -; GFX11-NEXT: v_dual_mov_b32 v83, s51 :: v_dual_mov_b32 v16, s54 -; GFX11-NEXT: v_dual_mov_b32 v86, s53 :: v_dual_mov_b32 v13, s52 -; GFX11-NEXT: v_dual_mov_b32 v85, s36 :: v_dual_mov_b32 v14, s50 -; GFX11-NEXT: v_dual_mov_b32 v1, s74 :: v_dual_mov_b32 v2, s76 -; 
GFX11-NEXT: v_dual_mov_b32 v3, s78 :: v_dual_mov_b32 v4, s88 -; GFX11-NEXT: v_dual_mov_b32 v8, s90 :: v_dual_mov_b32 v9, s92 -; GFX11-NEXT: s_mov_b32 s58, s11 -; GFX11-NEXT: v_readlane_b32 s59, v43, 8 -; GFX11-NEXT: v_readlane_b32 s72, v43, 9 -; GFX11-NEXT: v_readlane_b32 s60, v43, 10 -; GFX11-NEXT: v_readlane_b32 s61, v43, 11 -; GFX11-NEXT: v_readlane_b32 s62, v43, 12 -; GFX11-NEXT: v_readlane_b32 s63, v43, 13 -; GFX11-NEXT: v_readlane_b32 s73, v43, 14 -; GFX11-NEXT: v_readlane_b32 s13, v43, 15 -; GFX11-NEXT: v_readlane_b32 s15, v43, 16 -; GFX11-NEXT: v_readlane_b32 s41, v43, 17 -; GFX11-NEXT: v_readlane_b32 s43, v43, 18 -; GFX11-NEXT: v_readlane_b32 s56, v43, 19 -; GFX11-NEXT: v_readlane_b32 s11, v43, 20 -; GFX11-NEXT: v_readlane_b32 s57, v43, 21 -; GFX11-NEXT: v_readlane_b32 s10, v43, 22 -; GFX11-NEXT: v_readlane_b32 s74, v43, 23 -; GFX11-NEXT: v_readlane_b32 s9, v43, 24 -; GFX11-NEXT: v_readlane_b32 s75, v43, 25 -; GFX11-NEXT: v_readlane_b32 s8, v43, 26 -; GFX11-NEXT: v_readlane_b32 s76, v43, 27 -; GFX11-NEXT: v_readlane_b32 s77, v43, 28 -; GFX11-NEXT: v_readlane_b32 s78, v43, 29 -; GFX11-NEXT: v_readlane_b32 s79, v43, 30 -; GFX11-NEXT: v_readlane_b32 s88, v43, 31 -; GFX11-NEXT: v_readlane_b32 s89, v42, 0 -; GFX11-NEXT: v_readlane_b32 s90, v42, 1 -; GFX11-NEXT: v_readlane_b32 s91, v42, 2 -; GFX11-NEXT: v_readlane_b32 s92, v42, 3 -; GFX11-NEXT: v_readlane_b32 s47, v42, 4 -; GFX11-NEXT: v_readlane_b32 s93, v42, 5 -; GFX11-NEXT: v_readlane_b32 vcc_hi, v43, 7 -; GFX11-NEXT: v_readlane_b32 s46, v42, 6 -; GFX11-NEXT: v_readlane_b32 s31, v43, 1 -; GFX11-NEXT: v_readlane_b32 s95, v42, 7 -; GFX11-NEXT: v_readlane_b32 s45, v42, 8 -; GFX11-NEXT: v_readlane_b32 s35, v43, 5 +; GFX11-NEXT: v_dual_mov_b32 v3, s74 :: v_dual_mov_b32 v10, s90 +; GFX11-NEXT: v_dual_mov_b32 v15, s94 :: v_dual_mov_b32 v16, s30 +; GFX11-NEXT: v_readlane_b32 s90, v42, 4 +; GFX11-NEXT: v_readlane_b32 s94, v42, 2 +; GFX11-NEXT: v_readlane_b32 s91, v42, 5 +; GFX11-NEXT: v_readlane_b32 s95, v42, 3 +; 
GFX11-NEXT: v_readlane_b32 s30, v42, 0 +; GFX11-NEXT: v_dual_mov_b32 v53, s4 :: v_dual_mov_b32 v52, s5 +; GFX11-NEXT: v_dual_mov_b32 v51, s40 :: v_dual_mov_b32 v50, s6 +; GFX11-NEXT: v_dual_mov_b32 v48, s7 :: v_dual_mov_b32 v49, s98 +; GFX11-NEXT: v_dual_mov_b32 v39, s8 :: v_dual_mov_b32 v38, s97 +; GFX11-NEXT: v_dual_mov_b32 v37, s9 :: v_dual_mov_b32 v36, s58 +; GFX11-NEXT: v_dual_mov_b32 v35, s59 :: v_dual_mov_b32 v34, s15 +; GFX11-NEXT: v_dual_mov_b32 v33, s60 :: v_dual_mov_b32 v32, s61 +; GFX11-NEXT: v_dual_mov_b32 v31, s14 :: v_dual_mov_b32 v30, s62 +; GFX11-NEXT: v_dual_mov_b32 v28, s63 :: v_dual_mov_b32 v29, s96 +; GFX11-NEXT: v_dual_mov_b32 v27, s72 :: v_dual_mov_b32 v26, s13 +; GFX11-NEXT: v_dual_mov_b32 v25, s73 :: v_dual_mov_b32 v24, s28 +; GFX11-NEXT: v_dual_mov_b32 v22, s29 :: v_dual_mov_b32 v23, s12 +; GFX11-NEXT: v_dual_mov_b32 v17, s87 :: v_dual_mov_b32 v54, s86 +; GFX11-NEXT: v_dual_mov_b32 v7, s84 :: v_dual_mov_b32 v66, s85 +; GFX11-NEXT: v_dual_mov_b32 v11, s11 :: v_dual_mov_b32 v64, s83 +; GFX11-NEXT: v_dual_mov_b32 v65, s10 :: v_dual_mov_b32 v70, s81 +; GFX11-NEXT: v_dual_mov_b32 v55, s48 :: v_dual_mov_b32 v68, s80 +; GFX11-NEXT: v_dual_mov_b32 v69, s82 :: v_dual_mov_b32 v80, s71 +; GFX11-NEXT: v_dual_mov_b32 v67, s69 :: v_dual_mov_b32 v20, s70 +; GFX11-NEXT: v_dual_mov_b32 v71, s65 :: v_dual_mov_b32 v82, s67 +; GFX11-NEXT: v_dual_mov_b32 v21, s68 :: v_dual_mov_b32 v18, s66 +; GFX11-NEXT: v_dual_mov_b32 v81, s54 :: v_dual_mov_b32 v84, s64 +; GFX11-NEXT: v_dual_mov_b32 v19, s39 :: v_dual_mov_b32 v12, s55 +; GFX11-NEXT: v_dual_mov_b32 v83, s38 :: v_dual_mov_b32 v86, s52 +; GFX11-NEXT: v_dual_mov_b32 v13, s53 :: v_dual_mov_b32 v8, s51 +; GFX11-NEXT: v_dual_mov_b32 v85, s36 :: v_dual_mov_b32 v96, s49 +; GFX11-NEXT: v_dual_mov_b32 v9, s50 :: v_dual_mov_b32 v2, s35 +; GFX11-NEXT: v_dual_mov_b32 v1, s37 :: v_dual_mov_b32 v4, s76 +; GFX11-NEXT: v_dual_mov_b32 v87, s34 :: v_dual_mov_b32 v6, s88 +; GFX11-NEXT: v_dual_mov_b32 v5, s78 :: v_dual_mov_b32 
v14, s92 +; GFX11-NEXT: s_mov_b32 s61, s41 +; GFX11-NEXT: v_readlane_b32 s62, v42, 6 +; GFX11-NEXT: v_readlane_b32 s58, v42, 7 +; GFX11-NEXT: v_readlane_b32 s59, v42, 8 +; GFX11-NEXT: v_readlane_b32 s60, v42, 9 +; GFX11-NEXT: v_readlane_b32 s63, v42, 10 +; GFX11-NEXT: v_readlane_b32 s72, v42, 11 +; GFX11-NEXT: v_readlane_b32 s73, v42, 12 +; GFX11-NEXT: v_readlane_b32 s43, v42, 13 +; GFX11-NEXT: v_readlane_b32 s41, v42, 14 +; GFX11-NEXT: v_readlane_b32 s45, v42, 15 +; GFX11-NEXT: v_readlane_b32 s40, v42, 16 +; GFX11-NEXT: v_readlane_b32 s47, v42, 17 +; GFX11-NEXT: v_readlane_b32 s29, v42, 18 +; GFX11-NEXT: v_readlane_b32 s57, v42, 19 +; GFX11-NEXT: v_readlane_b32 s28, v42, 20 +; GFX11-NEXT: v_readlane_b32 s74, v42, 21 +; GFX11-NEXT: v_readlane_b32 s15, v42, 22 +; GFX11-NEXT: v_readlane_b32 s75, v42, 23 +; GFX11-NEXT: v_readlane_b32 s14, v42, 24 +; GFX11-NEXT: v_readlane_b32 s76, v42, 25 +; GFX11-NEXT: v_readlane_b32 s11, v42, 26 +; GFX11-NEXT: v_readlane_b32 s77, v42, 27 +; GFX11-NEXT: v_readlane_b32 s10, v42, 28 +; GFX11-NEXT: v_readlane_b32 s78, v42, 29 +; GFX11-NEXT: v_readlane_b32 s13, v42, 30 +; GFX11-NEXT: v_readlane_b32 s79, v42, 31 +; GFX11-NEXT: v_readlane_b32 s12, v43, 0 +; GFX11-NEXT: v_readlane_b32 s88, v43, 1 +; GFX11-NEXT: v_readlane_b32 s9, v43, 2 +; GFX11-NEXT: v_readlane_b32 s89, v43, 3 +; GFX11-NEXT: s_mov_b32 s92, s100 +; GFX11-NEXT: v_readlane_b32 s8, v43, 4 +; GFX11-NEXT: v_readlane_b32 s91, v43, 5 +; GFX11-NEXT: v_readlane_b32 s93, v43, 6 +; GFX11-NEXT: v_readlane_b32 s95, v43, 7 +; GFX11-NEXT: s_mov_b32 vcc_lo, s102 +; GFX11-NEXT: v_readlane_b32 vcc_hi, v43, 8 +; GFX11-NEXT: v_readlane_b32 s31, v42, 1 +; GFX11-NEXT: v_readlane_b32 s7, v43, 9 ; GFX11-NEXT: .LBB91_5: ; %end ; GFX11-NEXT: s_and_b32 s0, s0, 0xff ; GFX11-NEXT: s_lshl_b32 s4, s104, 8 -; GFX11-NEXT: s_and_b32 s5, s103, 0xff -; GFX11-NEXT: s_lshl_b32 s6, s42, 8 +; GFX11-NEXT: s_and_b32 s5, s44, 0xff +; GFX11-NEXT: s_lshl_b32 s6, s56, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s4 ; GFX11-NEXT: 
s_or_b32 s4, s5, s6 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_lshl_b32 s5, s102, 8 -; GFX11-NEXT: s_and_b32 s6, s58, 0xff -; GFX11-NEXT: s_lshl_b32 s7, s101, 8 +; GFX11-NEXT: s_lshl_b32 s5, s99, 8 +; GFX11-NEXT: s_and_b32 s6, s61, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s7, 8 ; GFX11-NEXT: s_or_b32 s1, s1, s5 ; GFX11-NEXT: s_or_b32 s5, s6, s7 ; GFX11-NEXT: s_and_b32 s0, s0, 0xffff @@ -169011,15 +169131,15 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: s_or_b32 s0, s0, s4 ; GFX11-NEXT: s_or_b32 s1, s1, s5 ; GFX11-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-NEXT: s_lshl_b32 s4, s100, 8 -; GFX11-NEXT: s_and_b32 s5, s99, 0xff -; GFX11-NEXT: s_lshl_b32 s6, s40, 8 +; GFX11-NEXT: s_lshl_b32 s4, vcc_hi, 8 +; GFX11-NEXT: s_and_b32 s5, s93, 0xff +; GFX11-NEXT: s_lshl_b32 s6, s46, 8 ; GFX11-NEXT: s_or_b32 s2, s2, s4 ; GFX11-NEXT: s_or_b32 s4, s5, s6 ; GFX11-NEXT: s_and_b32 s3, s3, 0xff -; GFX11-NEXT: s_lshl_b32 s5, s45, 8 -; GFX11-NEXT: s_and_b32 s6, s59, 0xff -; GFX11-NEXT: s_lshl_b32 s7, s95, 8 +; GFX11-NEXT: s_lshl_b32 s5, s95, 8 +; GFX11-NEXT: s_and_b32 s6, s62, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s91, 8 ; GFX11-NEXT: s_or_b32 s3, s3, s5 ; GFX11-NEXT: s_or_b32 s5, s6, s7 ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff @@ -169031,15 +169151,15 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: v_dual_mov_b32 v97, s0 :: v_dual_mov_b32 v98, s1 ; GFX11-NEXT: v_dual_mov_b32 v99, s2 :: v_dual_mov_b32 v100, s3 ; GFX11-NEXT: s_and_b32 s0, s16, 0xff -; GFX11-NEXT: s_lshl_b32 s1, s46, 8 -; GFX11-NEXT: s_and_b32 s2, s93, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s14, 8 +; GFX11-NEXT: s_lshl_b32 s1, s8, 8 +; GFX11-NEXT: s_and_b32 s2, s89, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s30, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_or_b32 s1, s2, s3 ; GFX11-NEXT: s_and_b32 s2, s17, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s47, 8 -; GFX11-NEXT: s_and_b32 s4, s72, 0xff -; GFX11-NEXT: s_lshl_b32 s5, s92, 8 +; 
GFX11-NEXT: s_lshl_b32 s3, s9, 8 +; GFX11-NEXT: s_and_b32 s4, s58, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s88, 8 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s4, s5 ; GFX11-NEXT: s_and_b32 s0, s0, 0xffff @@ -169049,15 +169169,15 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_or_b32 s1, s2, s3 ; GFX11-NEXT: s_and_b32 s2, s18, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s91, 8 -; GFX11-NEXT: s_and_b32 s4, s90, 0xff -; GFX11-NEXT: s_lshl_b32 s5, s12, 8 +; GFX11-NEXT: s_lshl_b32 s3, s12, 8 +; GFX11-NEXT: s_and_b32 s4, s79, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s42, 8 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s4, s5 ; GFX11-NEXT: s_and_b32 s4, s19, 0xff -; GFX11-NEXT: s_lshl_b32 s5, s89, 8 -; GFX11-NEXT: s_and_b32 s6, s60, 0xff -; GFX11-NEXT: s_lshl_b32 s7, s88, 8 +; GFX11-NEXT: s_lshl_b32 s5, s13, 8 +; GFX11-NEXT: s_and_b32 s6, s59, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s78, 8 ; GFX11-NEXT: s_or_b32 s4, s4, s5 ; GFX11-NEXT: s_or_b32 s5, s6, s7 ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff @@ -169069,14 +169189,14 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: v_dual_mov_b32 v112, s0 :: v_dual_mov_b32 v113, s1 ; GFX11-NEXT: v_dual_mov_b32 v114, s2 :: v_dual_mov_b32 v115, s3 ; GFX11-NEXT: s_and_b32 s0, s20, 0xff -; GFX11-NEXT: s_lshl_b32 s1, s79, 8 -; GFX11-NEXT: s_and_b32 s2, s78, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s30, 8 +; GFX11-NEXT: s_lshl_b32 s1, s10, 8 +; GFX11-NEXT: s_and_b32 s2, s77, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s94, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_or_b32 s1, s2, s3 ; GFX11-NEXT: s_and_b32 s2, s21, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s77, 8 -; GFX11-NEXT: s_and_b32 s4, s61, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s11, 8 +; GFX11-NEXT: s_and_b32 s4, s60, 0xff ; GFX11-NEXT: s_lshl_b32 s5, s76, 8 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s4, s5 @@ -169087,14 +169207,14 @@ define 
inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_or_b32 s1, s2, s3 ; GFX11-NEXT: s_and_b32 s2, s22, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s8, 8 +; GFX11-NEXT: s_lshl_b32 s3, s14, 8 ; GFX11-NEXT: s_and_b32 s4, s75, 0xff -; GFX11-NEXT: s_lshl_b32 s5, s94, 8 +; GFX11-NEXT: s_lshl_b32 s5, s90, 8 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s4, s5 ; GFX11-NEXT: s_and_b32 s4, s23, 0xff -; GFX11-NEXT: s_lshl_b32 s5, s9, 8 -; GFX11-NEXT: s_and_b32 s6, s62, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s15, 8 +; GFX11-NEXT: s_and_b32 s6, s63, 0xff ; GFX11-NEXT: s_lshl_b32 s7, s74, 8 ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 @@ -169110,32 +169230,32 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: v_dual_mov_b32 v97, s0 :: v_dual_mov_b32 v98, s1 ; GFX11-NEXT: v_dual_mov_b32 v99, s2 :: v_dual_mov_b32 v100, s3 ; GFX11-NEXT: s_and_b32 s0, s24, 0xff -; GFX11-NEXT: s_lshl_b32 s1, s10, 8 +; GFX11-NEXT: s_lshl_b32 s1, s28, 8 ; GFX11-NEXT: s_and_b32 s2, s57, 0xff -; GFX11-NEXT: s_lshl_b32 s4, s34, 8 +; GFX11-NEXT: s_lshl_b32 s4, vcc_lo, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_or_b32 s1, s2, s4 ; GFX11-NEXT: s_and_b32 s0, s0, 0xffff ; GFX11-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-NEXT: s_lshl_b32 s2, s11, 8 +; GFX11-NEXT: s_lshl_b32 s2, s29, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_and_b32 s1, s25, 0xff -; GFX11-NEXT: s_and_b32 s3, s63, 0xff -; GFX11-NEXT: s_lshl_b32 s4, s56, 8 +; GFX11-NEXT: s_and_b32 s3, s72, 0xff +; GFX11-NEXT: s_lshl_b32 s4, s47, 8 ; GFX11-NEXT: s_or_b32 s1, s1, s2 ; GFX11-NEXT: s_or_b32 s2, s3, s4 ; GFX11-NEXT: s_and_b32 s1, s1, 0xffff ; GFX11-NEXT: s_lshl_b32 s2, s2, 16 ; GFX11-NEXT: s_and_b32 s3, s26, 0xff -; GFX11-NEXT: s_lshl_b32 s4, s43, 8 +; GFX11-NEXT: s_lshl_b32 s4, s40, 8 ; GFX11-NEXT: s_or_b32 s1, s1, s2 ; GFX11-NEXT: s_or_b32 s2, s3, s4 -; GFX11-NEXT: s_and_b32 s3, s41, 0xff 
-; GFX11-NEXT: s_lshl_b32 s4, vcc_lo, 8 -; GFX11-NEXT: s_lshl_b32 s5, s15, 8 +; GFX11-NEXT: s_and_b32 s3, s45, 0xff +; GFX11-NEXT: s_lshl_b32 s4, s92, 8 +; GFX11-NEXT: s_lshl_b32 s5, s41, 8 ; GFX11-NEXT: s_or_b32 s3, s3, s4 ; GFX11-NEXT: s_and_b32 s4, s27, 0xff -; GFX11-NEXT: s_lshl_b32 s6, s13, 8 +; GFX11-NEXT: s_lshl_b32 s6, s43, 8 ; GFX11-NEXT: s_or_b32 s4, s4, s5 ; GFX11-NEXT: s_and_b32 s5, s73, 0xff ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff @@ -169143,160 +169263,160 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 ; GFX11-NEXT: s_and_b32 s4, s4, 0xffff ; GFX11-NEXT: s_lshl_b32 s5, s5, 16 -; GFX11-NEXT: v_dual_mov_b32 v112, s0 :: v_dual_and_b32 v23, 0xff, v23 -; GFX11-NEXT: v_dual_mov_b32 v113, s1 :: v_dual_lshlrev_b32 v6, 8, v6 +; GFX11-NEXT: v_dual_mov_b32 v113, s1 :: v_dual_and_b32 v24, 0xff, v24 +; GFX11-NEXT: v_dual_mov_b32 v112, s0 :: v_dual_lshlrev_b32 v1, 8, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 8, v16 ; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v114, s2 :: v_dual_lshlrev_b32 v11, 8, v11 ; GFX11-NEXT: s_or_b32 s3, s4, s5 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_dual_mov_b32 v115, s3 :: v_dual_and_b32 v96, 0xff, v96 -; GFX11-NEXT: v_or_b32_e32 v6, v23, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; GFX11-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v11, v96, v11 -; GFX11-NEXT: v_lshlrev_b32_e32 v10, 8, v10 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GFX11-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; GFX11-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX11-NEXT: v_or_b32_e32 v1, 
v24, v1 +; GFX11-NEXT: v_mov_b32_e32 v114, s2 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX11-NEXT: v_or_b32_e32 v16, v96, v16 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; GFX11-NEXT: v_or_b32_e32 v23, v6, v11 -; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v21 -; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v22 -; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v87 -; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v26 -; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v86 -; GFX11-NEXT: v_or_b32_e32 v6, v6, v7 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GFX11-NEXT: v_or_b32_e32 v7, v11, v21 -; GFX11-NEXT: v_or_b32_e32 v11, v22, v13 -; GFX11-NEXT: v_or_b32_e32 v10, v26, v10 -; GFX11-NEXT: v_or_b32_e32 v13, v24, v14 -; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v25 -; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v85 -; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v29 -; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v84 -; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v27 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX11-NEXT: v_or_b32_e32 v9, v25, v9 +; GFX11-NEXT: v_or_b32_e32 v24, v1, v16 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v22 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v23 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 8, v87 +; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v27 +; GFX11-NEXT: v_and_b32_e32 v27, 0xff, v86 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v84 +; GFX11-NEXT: v_or_b32_e32 v2, v16, v22 +; GFX11-NEXT: v_or_b32_e32 v8, v23, v8 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v26 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 8, v85 +; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v30 +; GFX11-NEXT: v_or_b32_e32 v15, v27, v15 ; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v28 -; GFX11-NEXT: v_lshlrev_b32_e32 v27, 8, v83 -; GFX11-NEXT: v_or_b32_e32 v14, v14, v21 -; 
GFX11-NEXT: v_or_b32_e32 v15, v22, v15 -; GFX11-NEXT: v_or_b32_e32 v9, v24, v9 -; GFX11-NEXT: v_or_b32_e32 v16, v25, v16 -; GFX11-NEXT: v_or_b32_e32 v21, v26, v27 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX11-NEXT: v_and_b32_e32 v27, 0xff, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v28, 8, v83 +; GFX11-NEXT: v_or_b32_e32 v16, v16, v22 +; GFX11-NEXT: v_or_b32_e32 v12, v23, v12 +; GFX11-NEXT: v_or_b32_e32 v14, v25, v14 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_or_b32_e32 v13, v26, v13 +; GFX11-NEXT: v_or_b32_e32 v22, v27, v28 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-NEXT: v_or_b32_e32 v24, v6, v7 -; GFX11-NEXT: v_or_b32_e32 v25, v11, v10 -; GFX11-NEXT: v_or_b32_e32 v26, v13, v14 -; GFX11-NEXT: v_or_b32_e32 v6, v15, v9 -; GFX11-NEXT: v_or_b32_e32 v7, v16, v21 -; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v32 -; GFX11-NEXT: v_lshlrev_b32_e32 v10, 8, v17 -; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v82 -; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v31 -; GFX11-NEXT: v_lshlrev_b32_e32 v14, 8, v18 -; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v30 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-NEXT: v_or_b32_e32 v25, v1, v2 +; GFX11-NEXT: v_or_b32_e32 v26, v8, v15 +; GFX11-NEXT: v_or_b32_e32 v27, v9, v16 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v14 
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v33 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v18 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v82 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 8, v10 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v32 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 8, v19 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v31 ; GFX11-NEXT: v_lshlrev_b32_e32 v16, 8, v81 -; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v35 -; GFX11-NEXT: v_lshlrev_b32_e32 v18, 8, v19 -; GFX11-NEXT: v_or_b32_e32 v9, v9, v10 -; GFX11-NEXT: v_or_b32_e32 v8, v11, v8 -; GFX11-NEXT: v_or_b32_e32 v10, v13, v14 -; GFX11-NEXT: v_or_b32_e32 v11, v15, v16 -; GFX11-NEXT: v_or_b32_e32 v13, v17, v18 +; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v36 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 8, v20 +; GFX11-NEXT: v_or_b32_e32 v13, v13, v22 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-NEXT: v_or_b32_e32 v2, v8, v9 +; GFX11-NEXT: v_or_b32_e32 v8, v10, v14 +; GFX11-NEXT: v_or_b32_e32 v9, v15, v16 +; GFX11-NEXT: v_or_b32_e32 v10, v18, v19 ; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v80 -; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v34 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 8, v20 -; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v33 -; GFX11-NEXT: v_lshlrev_b32_e32 v18, 8, v71 -; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v38 -; GFX11-NEXT: v_lshlrev_b32_e32 v20, 8, v70 -; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v69 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX11-NEXT: v_or_b32_e32 v4, v14, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v35 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 8, v21 +; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v34 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 8, v71 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v39 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v70 +; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v69 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX11-NEXT: v_or_b32_e32 v6, v14, v6 ; GFX11-NEXT: v_or_b32_e32 v14, v15, v16 -; GFX11-NEXT: v_or_b32_e32 v15, v17, v18 -; GFX11-NEXT: v_or_b32_e32 v16, v19, v20 -; GFX11-NEXT: 
v_or_b32_e32 v3, v21, v3 -; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: v_or_b32_e32 v15, v18, v19 +; GFX11-NEXT: v_or_b32_e32 v16, v20, v21 +; GFX11-NEXT: v_or_b32_e32 v5, v22, v5 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v15 ; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_or_b32_e32 v8, v9, v8 -; GFX11-NEXT: v_or_b32_e32 v9, v10, v11 -; GFX11-NEXT: v_or_b32_e32 v13, v13, v4 -; GFX11-NEXT: v_or_b32_e32 v14, v14, v15 -; GFX11-NEXT: v_or_b32_e32 v15, v16, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v36 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 8, v68 -; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v37 -; GFX11-NEXT: v_lshlrev_b32_e32 v11, 8, v67 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_or_b32_e32 v14, v1, v2 +; GFX11-NEXT: v_or_b32_e32 v15, v8, v9 +; GFX11-NEXT: v_or_b32_e32 v8, v10, v6 +; GFX11-NEXT: v_or_b32_e32 v9, v18, v19 +; GFX11-NEXT: v_or_b32_e32 v10, v16, v5 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v37 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v68 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v38 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 8, v67 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v50 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 8, v66 +; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v65 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v48 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v64 +; GFX11-NEXT: 
v_or_b32_e32 v1, v1, v2 +; GFX11-NEXT: v_or_b32_e32 v2, v5, v6 +; GFX11-NEXT: v_or_b32_e32 v5, v16, v18 +; GFX11-NEXT: v_or_b32_e32 v4, v19, v4 +; GFX11-NEXT: v_or_b32_e32 v6, v20, v21 ; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v49 -; GFX11-NEXT: v_lshlrev_b32_e32 v17, 8, v66 -; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v65 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v39 -; GFX11-NEXT: v_lshlrev_b32_e32 v20, 8, v64 -; GFX11-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v4, v10, v11 -; GFX11-NEXT: v_or_b32_e32 v10, v16, v17 -; GFX11-NEXT: v_or_b32_e32 v2, v18, v2 -; GFX11-NEXT: v_or_b32_e32 v11, v19, v20 -; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v48 -; GFX11-NEXT: v_lshlrev_b32_e32 v17, 8, v55 -; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v52 -; GFX11-NEXT: v_lshlrev_b32_e32 v19, 8, v54 -; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v53 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v51 -; GFX11-NEXT: v_lshlrev_b32_e32 v12, 8, v12 -; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v50 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GFX11-NEXT: v_or_b32_e32 v16, v16, v17 -; GFX11-NEXT: v_or_b32_e32 v17, v18, v19 -; GFX11-NEXT: v_or_b32_e32 v1, v20, v1 -; GFX11-NEXT: v_or_b32_e32 v12, v21, v12 -; GFX11-NEXT: v_or_b32_e32 v5, v22, v5 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 8, v55 +; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v53 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 8, v54 +; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v52 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v51 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX11-NEXT: v_or_b32_e32 v16, v16, v18 +; GFX11-NEXT: v_or_b32_e32 v18, v19, v20 +; GFX11-NEXT: v_or_b32_e32 v3, v17, v3 +; GFX11-NEXT: v_or_b32_e32 v11, v21, v11 +; 
GFX11-NEXT: v_or_b32_e32 v7, v22, v7 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v16 -; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_or_b32_e32 v16, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v1, v10, v2 -; GFX11-NEXT: v_or_b32_e32 v2, v11, v18 -; GFX11-NEXT: v_or_b32_e32 v3, v17, v19 -; GFX11-NEXT: v_or_b32_e32 v4, v12, v5 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_or_b32_e32 v11, v1, v2 +; GFX11-NEXT: v_or_b32_e32 v1, v5, v4 +; GFX11-NEXT: v_or_b32_e32 v2, v6, v16 +; GFX11-NEXT: v_or_b32_e32 v3, v17, v3 +; GFX11-NEXT: v_or_b32_e32 v4, v18, v7 ; GFX11-NEXT: s_clause 0x5 ; GFX11-NEXT: scratch_store_b128 v0, v[97:100], off offset:32 ; GFX11-NEXT: scratch_store_b128 v0, v[112:115], off offset:48 -; GFX11-NEXT: scratch_store_b128 v0, v[23:26], off offset:64 -; GFX11-NEXT: scratch_store_b128 v0, v[6:9], off offset:80 -; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:96 +; GFX11-NEXT: scratch_store_b128 v0, v[24:27], off offset:64 +; GFX11-NEXT: scratch_store_b128 v0, v[12:15], off offset:80 +; GFX11-NEXT: scratch_store_b128 v0, v[8:11], off offset:96 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112 ; GFX11-NEXT: v_readlane_b32 s104, v41, 8 ; GFX11-NEXT: v_readlane_b32 s103, v41, 7 @@ -169385,39 +169505,40 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded 
Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v44, v19 -; SI-NEXT: v_mov_b32_e32 v43, v17 -; SI-NEXT: v_mov_b32_e32 v32, v14 -; SI-NEXT: v_mov_b32_e32 v14, v12 -; SI-NEXT: v_mov_b32_e32 v12, v10 -; SI-NEXT: v_mov_b32_e32 v41, v7 -; SI-NEXT: v_mov_b32_e32 v55, v5 -; SI-NEXT: v_mov_b32_e32 v54, v3 -; SI-NEXT: v_mov_b32_e32 v51, v1 -; SI-NEXT: v_mov_b32_e32 v10, v0 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:392 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v51, v3 +; SI-NEXT: v_mov_b32_e32 v49, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v0 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:392 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:132 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 ; SI-NEXT: 
buffer_load_dword v35, off, s[0:3], s32 offset:148 @@ -169426,129 +169547,135 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:172 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:180 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:188 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:128 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v18 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v20 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v22 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 
v2, 8, v24 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v28 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v30 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 ; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v12 ; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v32 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v27 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v49 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; 
SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v50 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v52 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v53 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
v_lshlrev_b32_e32 v1, 8, v42 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144 -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded 
Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 ; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:160 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:168 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:176 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:144 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_load_dword v15, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 @@ -169559,27 +169686,27 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 
s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 @@ -169587,82 +169714,78 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:240 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v54, 8, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:848 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:272 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v1 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v2 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v3 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v45, 8, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:896 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:304 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:340 @@ -169678,79 +169801,64 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:960 ; 
4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:336 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:372 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:380 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:352 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:360 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:368 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v1 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v2 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:384 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v3 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:48 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:360 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:368 -; SI-NEXT: s_waitcnt vmcnt(0) -; 
SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:384 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill ; SI-NEXT: 
s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:184 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:152 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:184 ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:216 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:248 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:280 ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:312 ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:344 @@ -169763,686 +169871,265 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: 
killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; 
kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB92_2 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xff, v51 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 -; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 -; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 -; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 -; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 -; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; SI-NEXT: v_and_b32_e32 v34, 0xff, v34 -; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 -; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 -; SI-NEXT: 
v_and_b32_e32 v39, 0xff, v50 -; SI-NEXT: v_and_b32_e32 v48, 0xff, v40 -; SI-NEXT: v_and_b32_e32 v49, 0xff, v49 -; SI-NEXT: v_and_b32_e32 v52, 0xff, v52 -; SI-NEXT: v_and_b32_e32 v42, 0xff, v42 -; SI-NEXT: v_and_b32_e32 v46, 0xff, v46 -; SI-NEXT: v_or_b32_e32 v45, v46, v45 -; SI-NEXT: v_and_b32_e32 v56, 0xff, v56 -; SI-NEXT: v_or_b32_e32 v56, v56, v61 -; SI-NEXT: v_and_b32_e32 v57, 0xff, v57 -; SI-NEXT: v_and_b32_e32 v47, 0xff, v47 -; SI-NEXT: v_or_b32_e32 v1, v57, v1 -; SI-NEXT: v_or_b32_e32 v3, v47, v3 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xff, v54 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v55 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v2, v6 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v2, v8 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v2, v12 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v2, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xff, v13 -; SI-NEXT: v_mov_b32_e32 
v8, v7 -; SI-NEXT: v_mov_b32_e32 v7, v19 -; SI-NEXT: v_or_b32_e32 v19, v2, v32 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v15 -; SI-NEXT: v_and_b32_e32 v35, 0xff, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v17, v2, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xff, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v23, v2, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xff, v44 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v56 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v31, v2, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xff, v21 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v31 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v51, v2, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v51 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt 
vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v27, v2, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v29, v2, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v29 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v2, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v5, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 
v4, v4, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v33, v6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v45 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; 
implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: 
; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB92_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v7, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v8, v8, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v49 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; SI-NEXT: v_and_b32_e32 v34, 0xff, v34 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v36, 0xff, v36 +; SI-NEXT: v_and_b32_e32 v50, 0xff, v50 +; SI-NEXT: v_and_b32_e32 v52, 0xff, v52 +; SI-NEXT: v_and_b32_e32 v42, 0xff, v42 +; SI-NEXT: v_or_b32_e32 v54, v42, v54 +; SI-NEXT: v_and_b32_e32 v53, 0xff, v53 +; SI-NEXT: 
v_and_b32_e32 v41, 0xff, v41 +; SI-NEXT: v_or_b32_e32 v27, v41, v27 +; SI-NEXT: v_and_b32_e32 v41, 0xff, v57 +; SI-NEXT: v_or_b32_e32 v1, v41, v1 +; SI-NEXT: v_and_b32_e32 v41, 0xff, v47 +; SI-NEXT: v_or_b32_e32 v3, v41, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v63 +; SI-NEXT: v_and_b32_e32 v56, 0xff, v56 +; SI-NEXT: v_and_b32_e32 v55, 0xff, v55 +; SI-NEXT: v_or_b32_e32 v56, v56, v61 +; SI-NEXT: v_or_b32_e32 v55, v55, v62 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v2, v2, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v12, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v12, v12, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v51 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v2, v2, v6 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 -; SI-NEXT: ; 
implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v14, v14, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v7 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v32, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v32, v32, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v18, v18, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v22, v22, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 
4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v24, v24, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v26, v26, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v28, v28, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v30, v30, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v34, v34, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v16, 
v16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v8 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v20, v20, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v10 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v35, v35, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v12 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v36, 0xff, v0 -; 
SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v36, v36, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v14 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v37, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v37, v37, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v6 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v38, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v38, v38, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 
4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v39, v39, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v7, v7, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v48, v48, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v49, v49, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v2, v2, v6 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v50, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v50, v50, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v2, v2, v6 +; 
SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v9, v9, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v23, v2, v6 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v54, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v54, v54, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v52, v52, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v52 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v7, v2, v6 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v11, v11, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v31, v2, v6 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v11 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v7, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v53, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v53, v53, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v49, v2, v6 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; 
SI-NEXT: v_cvt_f32_f16_e32 v2, v53 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v55, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v55, v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v13, v13, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v41, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v41, v41, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v40, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v40, v40, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v48, v40 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v0, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v7, v49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v42, v42, v15 
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v38, v42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v43, 0xff, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v43, v43, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v25, v2, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 -; SI-NEXT: v_or_b32_e32 v15, v15, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v36, v15 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v44, 0xff, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v44, v44, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v46, 0xff, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v46, v46, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v46 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v58, 0xff, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v58, v58, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v58 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: 
v_and_b32_e32 v59, 0xff, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v59, v59, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v59 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v60, 0xff, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v60, v60, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v60 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v61, 0xff, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v25, v61, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v25 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v61, 0xff, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v61, v61, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 
v21, v21 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed 
$vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 @@ -170516,68 +170203,483 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; SI-NEXT: ; kill: killed $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v4, 
v2, v4 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v29, v2, v6 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v4, v29 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v32, v6, v8 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v11, v6, v8 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload 
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v33, v6, v8 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v18, v18, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v22, v22, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v24, v24, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 
offset:624 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v26, v26, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v28, v28, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v30, v30, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v34, v34, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v16, v16, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: 
v_or_b32_e32 v20, v20, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v13, 0xff, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v13, v13, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v13, v56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v2, v8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v36, v36, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v15 
+; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v15, 0xff, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v15, v15, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v19 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v14, v14, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v50, v50, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_or_b32_e32 v17, v17, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v17 +; SI-NEXT: ; 
implicit-def: $vgpr17 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v35, 0xff, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v35, v35, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v54 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v52, v52, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v52 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v37, 0xff, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v37, v37, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v38, 0xff, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:944 ; 4-byte 
Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v38, v38, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v39, 0xff, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v39, v39, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v53, v53, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v48 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v4, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v48, 0xff, v48 +; SI-NEXT: v_or_b32_e32 v48, v48, v51 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v38, v48 +; SI-NEXT: v_mov_b32_e32 v48, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v51, 0xff, v51 +; SI-NEXT: v_or_b32_e32 v51, v51, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v51 +; SI-NEXT: 
; implicit-def: $vgpr51 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v41, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v42, 0xff, v42 +; SI-NEXT: v_or_b32_e32 v40, v42, v40 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v2, v40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v41, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v42, 0xff, v42 +; SI-NEXT: v_or_b32_e32 v42, v42, v43 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v32, v42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v43, 0xff, v43 +; SI-NEXT: v_or_b32_e32 v43, v43, v44 +; SI-NEXT: v_and_b32_e32 v44, 0xff, v46 +; SI-NEXT: v_or_b32_e32 v44, v44, v45 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_load_dword v46, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v39, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: v_and_b32_e32 v45, 0xff, v45 +; SI-NEXT: v_or_b32_e32 v45, v45, v58 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v37, v45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v46, 0xff, v46 +; SI-NEXT: v_or_b32_e32 v46, v46, v59 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v58, 0xff, v58 +; SI-NEXT: v_or_b32_e32 v58, v58, v60 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v33, v58 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) 
expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v46 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; 
kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: .LBB92_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB92_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, 
s[0:3], s32 offset:972 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v5, v3, v2 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_or_b32_e32 v9, v3, v2 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v19 -; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v11, v63, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v13, v27, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v21, v62, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v56 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v23, v61, v2 ; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 -; SI-NEXT: v_and_b32_e32 v34, 0xff, v34 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 ; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 @@ -170595,791 +170697,737 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_mov_b32_e32 v17, v43 -; SI-NEXT: v_mov_b32_e32 v19, v44 -; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v19 -; SI-NEXT: v_and_b32_e32 v47, 0xff, v47 -; SI-NEXT: 
v_add_i32_e32 v57, vcc, 3, v15 -; SI-NEXT: v_and_b32_e32 v57, 0xff, v57 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v51 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_add_i32_e32 v63, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v63, 0xff, v63 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v23, v63, v2 -; SI-NEXT: v_add_i32_e32 v63, vcc, 3, v54 -; SI-NEXT: v_and_b32_e32 v63, 0xff, v63 +; SI-NEXT: v_or_b32_e32 v25, v60, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v25, v25, v2 +; SI-NEXT: v_or_b32_e32 v27, v59, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v31, v62, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v56 +; SI-NEXT: v_or_b32_e32 v31, v58, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v46 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v33, v61, v2 -; SI-NEXT: v_add_i32_e32 v56, vcc, 3, v17 -; SI-NEXT: v_and_b32_e32 v56, 0xff, v56 -; SI-NEXT: v_add_i32_e32 v61, vcc, 3, v41 -; SI-NEXT: v_and_b32_e32 v61, 0xff, v61 -; SI-NEXT: v_add_i32_e32 v62, vcc, 3, v55 -; SI-NEXT: v_and_b32_e32 v62, 0xff, v62 +; SI-NEXT: v_or_b32_e32 v32, v45, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:932 ; 4-byte 
Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v35, v60, v2 -; SI-NEXT: v_add_i32_e32 v60, vcc, 3, v9 -; SI-NEXT: v_and_b32_e32 v60, 0xff, v60 -; SI-NEXT: v_or_b32_e32 v12, v12, v60 -; SI-NEXT: v_add_i32_e32 v60, vcc, s6, v25 +; SI-NEXT: v_or_b32_e32 v33, v44, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v37, v59, v2 -; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v11 -; SI-NEXT: v_and_b32_e32 v59, 0xff, v59 -; SI-NEXT: v_or_b32_e32 v14, v14, v59 -; SI-NEXT: v_add_i32_e32 v59, vcc, s6, v31 +; SI-NEXT: v_or_b32_e32 v35, v43, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v38, v58, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v46 +; SI-NEXT: v_or_b32_e32 v37, v40, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v39, v45, v2 -; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v21 -; SI-NEXT: v_and_b32_e32 v46, 0xff, v46 -; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v13 -; SI-NEXT: v_and_b32_e32 v58, 0xff, v58 -; SI-NEXT: v_or_b32_e32 v32, v32, v58 -; SI-NEXT: v_add_i32_e32 v58, vcc, s6, v33 -; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v32 +; SI-NEXT: v_or_b32_e32 v38, v54, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v48, 
v0, v2 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v39, v0, v2 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v48, v0, v2 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v54, v0, v2 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v55, v0, v2 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt 
vmcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v41, v0, v2 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 
v2, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v52 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v50, v0, v2 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v19 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 
offset:584 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v19, v0, v2 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v17 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v49 +; SI-NEXT: v_or_b32_e32 v17, v0, v2 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v15 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v40 +; SI-NEXT: v_or_b32_e32 v15, v0, v2 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill 
-; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v0, v2 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, 
s[0:3], s32 offset:712 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v2, v0, v2 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v20, v0, v20 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v16, v0, v16 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v0, v0, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v0, v0, v29 +; SI-NEXT: buffer_store_dword 
v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v15 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded 
Reload -; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v16 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v36, 0xff, v36 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v0, v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v19 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v49, 0xff, v49 +; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v34, 0xff, v34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v49 -; SI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v0, v0, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v50, 0xff, v50 +; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v36, 0xff, v36 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v0, v0, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v52, 0xff, v52 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v52, vcc, s6, v54 
; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v53, 0xff, v53 -; SI-NEXT: v_or_b32_e32 v7, v7, v53 -; SI-NEXT: v_add_i32_e32 v54, vcc, s6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v14 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v7, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v53, vcc, s6, v48 +; SI-NEXT: v_add_i32_e32 v48, vcc, s6, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v40, 0xff, v40 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v40, vcc, s6, v39 +; SI-NEXT: v_add_i32_e32 v39, vcc, s6, v41 +; SI-NEXT: v_add_i32_e32 v41, vcc, s6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v0, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded 
Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v55, vcc, s6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v42, 0xff, v42 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v54, vcc, s6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v0, v42 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v42, vcc, s6, v38 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v51, vcc, s6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v43, 0xff, v43 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v43 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v43, vcc, s6, v37 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v44, 0xff, v44 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v44 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v44, vcc, s6, v35 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v45, 0xff, v45 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v45, vcc, s6, v48 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v45, vcc, s6, v33 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, 
s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v46, 0xff, v46 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v46, vcc, s6, v39 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v46, vcc, s6, v32 +; SI-NEXT: v_add_i32_e32 v32, vcc, s6, v50 +; SI-NEXT: v_add_i32_e32 v50, vcc, s6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v47, 0xff, v47 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v49, vcc, s6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v0, v47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v47, vcc, s6, v38 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v47, vcc, s6, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v56, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v56, 0xff, v56 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v56 -; SI-NEXT: buffer_store_dword v0, 
off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v56, vcc, s6, v37 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v56, vcc, s6, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v57, 0xff, v57 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v57 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v8, v61 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v6, v62 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v57, vcc, s6, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v57, vcc, s6, v35 -; SI-NEXT: v_add_i32_e32 v61, vcc, s6, v23 -; SI-NEXT: v_add_i32_e32 v62, vcc, s6, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v63, v0, v63 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: v_add_i32_e32 v31, vcc, 
s6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v58, 0xff, v58 +; SI-NEXT: v_or_b32_e32 v0, v14, v58 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v58, vcc, s6, v23 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v59, 0xff, v59 +; SI-NEXT: v_or_b32_e32 v0, v12, v59 +; SI-NEXT: v_add_i32_e32 v59, vcc, s6, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v41, vcc, s6, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v55, vcc, s6, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v51, vcc, s6, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v5 -; 
SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v44, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v60, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v60, 0xff, v60 +; SI-NEXT: v_or_b32_e32 v0, v10, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v60, vcc, s6, v13 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v61, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v61, 0xff, v61 +; SI-NEXT: v_or_b32_e32 v0, v8, v61 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v61, vcc, s6, v11 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v43, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; 
SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v42, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v40, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v53, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v52, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v62, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v62, 0xff, v62 +; SI-NEXT: v_or_b32_e32 v0, v6, v62 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v62, vcc, s6, v1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v63, v0, v63 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v50, vcc, s6, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v49, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 
v48, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v39, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v62 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v60 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v58 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v15 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v17 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v19 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v21 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v56 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v59 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v27 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v61 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v49 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v50 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v51 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v38, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v54 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v55 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v41 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v37, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt 
vmcnt(0) ; SI-NEXT: v_add_i32_e32 v36, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v35, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v33, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v0 +; SI-NEXT: 
v_add_i32_e32 v22, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload 
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:748 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v29, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v33, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v59 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v35, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 
v0, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v44 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v57 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v37, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v42 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v53 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v7 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v9 -; SI-NEXT: 
buffer_store_dword v5, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v11 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v13 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v15 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v60 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v21 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v58 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v61 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v25 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v51 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 -; SI-NEXT: buffer_store_dword v5, 
off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_add_i32_e32 v32, vcc, s6, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v12 -; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v63 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v14 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v32 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v63, vcc, s6, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 ; SI-NEXT: .LBB92_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v10 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v5 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: 
v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v10, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v0, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v10 +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v5 ; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v10 +; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v5 ; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt 
vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v10 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -171388,9 +171436,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v10 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -171399,9 +171447,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v10 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 
offset:808 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v5 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -171410,9 +171458,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v10 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v5 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -171421,9 +171469,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v10 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v5 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -171432,9 +171480,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 
offset:720 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v10 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v5 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -171443,9 +171491,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v10 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v5 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -171454,9 +171502,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v10 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v5 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -171465,9 +171513,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; 
SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v10 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v5 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -171476,9 +171524,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v10 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v5 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -171487,9 +171535,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v10 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v5 ; SI-NEXT: 
s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -171498,9 +171546,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v10 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v5 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -171509,9 +171557,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v10 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v5 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -171520,9 +171568,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v10 +; 
SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v5 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -171531,9 +171579,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v10 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v5 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -171542,9 +171590,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v10 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v5 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -171553,9 +171601,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt 
expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v10 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v5 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -171564,9 +171612,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v10 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v5 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -171575,9 +171623,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v10 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v5 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: 
s_waitcnt vmcnt(0) @@ -171586,9 +171634,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v10 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v5 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -171597,27 +171645,27 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v10 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v10 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; 
SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v10 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -171625,36 +171673,43 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v5 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v10 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v5 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v23 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v10 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v5 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v17 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v5 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: 
buffer_store_dword v0, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x7c, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v5 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x7c, v5 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen @@ -175870,11 +175925,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332 ; SI-NEXT: ; implicit-def: $vgpr61 : SGPR spill to VGPR lane ; SI-NEXT: s_mov_b32 s10, s16 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_writelane_b32 v61, s29, 0 -; SI-NEXT: v_writelane_b32 v61, s28, 1 -; SI-NEXT: v_writelane_b32 v61, s27, 2 -; SI-NEXT: s_mov_b32 s61, s21 +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_writelane_b32 v63, s30, 0 ; SI-NEXT: v_writelane_b32 v63, s31, 1 ; SI-NEXT: v_writelane_b32 v63, s34, 2 @@ -175909,59 +175960,59 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: v_writelane_b32 v63, s87, 31 ; SI-NEXT: v_writelane_b32 v63, s96, 32 ; SI-NEXT: v_writelane_b32 v63, s97, 33 -; SI-NEXT: s_mov_b32 s67, s19 -; SI-NEXT: s_mov_b32 s54, s17 -; SI-NEXT: s_mov_b32 s35, s23 -; SI-NEXT: s_mov_b32 s39, s26 -; SI-NEXT: s_mov_b32 s62, s25 +; SI-NEXT: s_mov_b32 s54, s27 +; SI-NEXT: s_mov_b32 s79, s29 +; SI-NEXT: s_mov_b32 s66, s26 +; SI-NEXT: s_mov_b32 s64, s23 +; SI-NEXT: s_mov_b32 s65, s19 +; SI-NEXT: s_mov_b32 s67, s17 ; SI-NEXT: v_writelane_b32 v63, s98, 34 ; SI-NEXT: v_writelane_b32 v63, s99, 35 -; SI-NEXT: 
v_readfirstlane_b32 s99, v1 -; SI-NEXT: v_readfirstlane_b32 s74, v24 +; SI-NEXT: s_mov_b32 s92, s24 +; SI-NEXT: v_readfirstlane_b32 s31, v1 +; SI-NEXT: v_readfirstlane_b32 s81, v23 ; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane -; SI-NEXT: v_readfirstlane_b32 s6, v23 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v62, s74, 0 ; SI-NEXT: v_readfirstlane_b32 s12, v26 -; SI-NEXT: v_writelane_b32 v62, s6, 1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v62, s81, 0 ; SI-NEXT: v_readfirstlane_b32 s14, v25 -; SI-NEXT: v_writelane_b32 v62, s12, 2 +; SI-NEXT: v_writelane_b32 v62, s12, 1 ; SI-NEXT: v_readfirstlane_b32 s46, v28 -; SI-NEXT: v_writelane_b32 v62, s14, 3 -; SI-NEXT: v_readfirstlane_b32 s56, v27 -; SI-NEXT: v_writelane_b32 v62, s46, 4 -; SI-NEXT: v_readfirstlane_b32 s57, v30 -; SI-NEXT: v_writelane_b32 v62, s56, 5 -; SI-NEXT: v_readfirstlane_b32 s59, v29 -; SI-NEXT: v_writelane_b32 v62, s57, 6 -; SI-NEXT: v_writelane_b32 v62, s59, 7 -; SI-NEXT: s_mov_b32 s60, s20 -; SI-NEXT: s_mov_b32 s63, s24 -; SI-NEXT: v_readfirstlane_b32 s95, v3 -; SI-NEXT: v_readfirstlane_b32 s31, v5 -; SI-NEXT: v_readfirstlane_b32 s24, v9 -; SI-NEXT: v_readfirstlane_b32 s38, v12 +; SI-NEXT: v_writelane_b32 v62, s14, 2 +; SI-NEXT: v_readfirstlane_b32 s57, v27 +; SI-NEXT: v_writelane_b32 v62, s46, 3 +; SI-NEXT: v_readfirstlane_b32 s58, v30 +; SI-NEXT: v_writelane_b32 v62, s57, 4 +; SI-NEXT: s_mov_b32 s77, s25 +; SI-NEXT: v_readfirstlane_b32 s25, v29 +; SI-NEXT: v_writelane_b32 v62, s58, 5 +; SI-NEXT: v_writelane_b32 v62, s25, 6 +; SI-NEXT: v_readfirstlane_b32 s55, v3 +; SI-NEXT: v_readfirstlane_b32 s80, v5 +; SI-NEXT: v_readfirstlane_b32 s51, v7 ; SI-NEXT: v_readfirstlane_b32 s36, v11 -; SI-NEXT: v_readfirstlane_b32 s8, v14 -; SI-NEXT: v_readfirstlane_b32 s27, v13 -; SI-NEXT: v_readfirstlane_b32 s9, v16 -; SI-NEXT: v_readfirstlane_b32 s79, v15 +; SI-NEXT: v_readfirstlane_b32 s87, v16 +; SI-NEXT: v_readfirstlane_b32 s84, v15 ; SI-NEXT: v_readfirstlane_b32 
s13, v18 ; SI-NEXT: v_readfirstlane_b32 s15, v17 ; SI-NEXT: v_readfirstlane_b32 s42, v20 ; SI-NEXT: v_readfirstlane_b32 s43, v19 ; SI-NEXT: v_readfirstlane_b32 s44, v22 +; SI-NEXT: v_readfirstlane_b32 s73, v21 +; SI-NEXT: v_readfirstlane_b32 s74, v24 +; SI-NEXT: v_readfirstlane_b32 s62, v14 +; SI-NEXT: v_readfirstlane_b32 s9, v13 +; SI-NEXT: v_readfirstlane_b32 s63, v12 +; SI-NEXT: v_readfirstlane_b32 s61, v10 +; SI-NEXT: v_readfirstlane_b32 s94, v9 +; SI-NEXT: v_readfirstlane_b32 s60, v8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:328 -; SI-NEXT: v_writelane_b32 v61, s4, 3 -; SI-NEXT: v_readfirstlane_b32 s45, v21 -; SI-NEXT: v_readfirstlane_b32 s98, v10 -; SI-NEXT: v_readfirstlane_b32 s90, v8 -; SI-NEXT: v_readfirstlane_b32 s88, v7 -; SI-NEXT: v_readfirstlane_b32 s91, v6 -; SI-NEXT: v_readfirstlane_b32 s93, v4 -; SI-NEXT: v_readfirstlane_b32 s55, v2 +; SI-NEXT: v_writelane_b32 v61, s4, 0 +; SI-NEXT: v_readfirstlane_b32 s35, v6 +; SI-NEXT: v_readfirstlane_b32 s91, v4 +; SI-NEXT: v_readfirstlane_b32 s95, v2 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill @@ -175979,142 +176030,142 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:324 -; SI-NEXT: v_writelane_b32 v61, s4, 4 +; SI-NEXT: v_writelane_b32 v61, s4, 1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:320 -; SI-NEXT: v_writelane_b32 v61, s4, 5 +; SI-NEXT: v_writelane_b32 v61, s4, 2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: 
v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:316 -; SI-NEXT: v_writelane_b32 v61, s4, 6 +; SI-NEXT: v_writelane_b32 v61, s4, 3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:312 -; SI-NEXT: v_writelane_b32 v61, s4, 7 +; SI-NEXT: v_writelane_b32 v61, s4, 4 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:308 -; SI-NEXT: v_writelane_b32 v61, s4, 8 +; SI-NEXT: v_writelane_b32 v61, s4, 5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:304 -; SI-NEXT: v_writelane_b32 v61, s4, 9 +; SI-NEXT: v_writelane_b32 v61, s4, 6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300 -; SI-NEXT: v_writelane_b32 v61, s4, 10 +; SI-NEXT: v_writelane_b32 v61, s4, 7 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:296 -; SI-NEXT: v_writelane_b32 v61, s4, 11 +; SI-NEXT: v_writelane_b32 v61, s4, 8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:292 -; SI-NEXT: v_writelane_b32 v61, s4, 12 +; SI-NEXT: v_writelane_b32 v61, s4, 9 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:288 -; SI-NEXT: v_writelane_b32 v61, s4, 13 +; SI-NEXT: v_writelane_b32 v61, s4, 10 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:284 -; SI-NEXT: v_writelane_b32 v61, s4, 14 +; SI-NEXT: v_writelane_b32 v61, s4, 11 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:280 -; SI-NEXT: 
v_writelane_b32 v61, s4, 15 +; SI-NEXT: v_writelane_b32 v61, s4, 12 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:276 -; SI-NEXT: v_writelane_b32 v61, s4, 16 +; SI-NEXT: v_writelane_b32 v61, s4, 13 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:272 -; SI-NEXT: v_writelane_b32 v61, s4, 17 +; SI-NEXT: v_writelane_b32 v61, s4, 14 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:268 -; SI-NEXT: v_writelane_b32 v61, s4, 18 +; SI-NEXT: v_writelane_b32 v61, s4, 15 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:264 -; SI-NEXT: v_writelane_b32 v61, s4, 19 +; SI-NEXT: v_writelane_b32 v61, s4, 16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:260 -; SI-NEXT: v_writelane_b32 v61, s4, 20 +; SI-NEXT: v_writelane_b32 v61, s4, 17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:256 -; SI-NEXT: v_writelane_b32 v61, s4, 21 +; SI-NEXT: v_writelane_b32 v61, s4, 18 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:252 -; SI-NEXT: v_writelane_b32 v61, s4, 22 +; SI-NEXT: v_writelane_b32 v61, s4, 19 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:248 -; SI-NEXT: v_writelane_b32 v61, s4, 23 +; SI-NEXT: v_writelane_b32 v61, s4, 20 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:244 -; SI-NEXT: v_writelane_b32 v61, s4, 24 +; SI-NEXT: v_writelane_b32 v61, s4, 21 ; SI-NEXT: s_waitcnt vmcnt(0) ; 
SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:240 -; SI-NEXT: v_writelane_b32 v61, s4, 25 +; SI-NEXT: v_writelane_b32 v61, s4, 22 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:236 -; SI-NEXT: v_writelane_b32 v61, s4, 26 +; SI-NEXT: v_writelane_b32 v61, s4, 23 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:232 -; SI-NEXT: v_writelane_b32 v61, s4, 27 +; SI-NEXT: v_writelane_b32 v61, s4, 24 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:228 -; SI-NEXT: v_writelane_b32 v61, s4, 28 +; SI-NEXT: v_writelane_b32 v61, s4, 25 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:224 -; SI-NEXT: v_writelane_b32 v61, s4, 29 +; SI-NEXT: v_writelane_b32 v61, s4, 26 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:220 -; SI-NEXT: v_writelane_b32 v61, s4, 30 +; SI-NEXT: v_writelane_b32 v61, s4, 27 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_readfirstlane_b32 s16, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:216 -; SI-NEXT: v_writelane_b32 v61, s4, 31 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212 -; SI-NEXT: v_writelane_b32 v61, s4, 32 +; SI-NEXT: v_writelane_b32 v61, s4, 28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s16, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:208 +; SI-NEXT: v_writelane_b32 v61, s4, 29 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 
offset:204 -; SI-NEXT: v_writelane_b32 v61, s4, 33 +; SI-NEXT: v_writelane_b32 v61, s4, 30 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s89, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:200 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:196 -; SI-NEXT: v_writelane_b32 v61, s4, 34 +; SI-NEXT: v_writelane_b32 v61, s4, 31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s73, v31 +; SI-NEXT: v_readfirstlane_b32 s93, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:192 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:188 -; SI-NEXT: v_writelane_b32 v61, s4, 35 +; SI-NEXT: v_writelane_b32 v61, s4, 32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s72, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:184 @@ -176122,270 +176173,265 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: v_readfirstlane_b32 s40, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:180 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s21, v31 +; SI-NEXT: v_readfirstlane_b32 s97, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:176 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s85, v31 +; SI-NEXT: v_readfirstlane_b32 s45, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:172 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s81, v31 +; SI-NEXT: v_readfirstlane_b32 s85, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:168 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s97, v31 +; SI-NEXT: v_readfirstlane_b32 s11, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:164 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s7, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword 
v31, off, s[0:3], s32 offset:160 +; SI-NEXT: v_writelane_b32 v61, s4, 33 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s11, v31 +; SI-NEXT: v_readfirstlane_b32 s7, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:156 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s41, v31 +; SI-NEXT: v_readfirstlane_b32 s47, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:152 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s47, v31 +; SI-NEXT: v_readfirstlane_b32 s41, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s58, v31 +; SI-NEXT: v_readfirstlane_b32 s59, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:144 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s76, v31 +; SI-NEXT: v_readfirstlane_b32 s56, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:140 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s29, v31 +; SI-NEXT: v_readfirstlane_b32 s78, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_readfirstlane_b32 s27, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 -; SI-NEXT: v_writelane_b32 v61, s4, 36 -; SI-NEXT: v_writelane_b32 v61, s54, 37 -; SI-NEXT: v_writelane_b32 v61, s10, 38 -; SI-NEXT: v_writelane_b32 v61, s67, 39 -; SI-NEXT: v_writelane_b32 v61, s18, 40 -; SI-NEXT: v_writelane_b32 v61, s61, 41 -; SI-NEXT: v_writelane_b32 v61, s60, 42 -; SI-NEXT: v_writelane_b32 v61, s35, 43 -; SI-NEXT: v_writelane_b32 v61, s22, 44 -; SI-NEXT: v_writelane_b32 v61, s62, 45 -; SI-NEXT: v_writelane_b32 v61, s63, 46 -; SI-NEXT: v_writelane_b32 v61, s39, 47 -; SI-NEXT: v_writelane_b32 v61, s99, 48 -; SI-NEXT: v_writelane_b32 v61, s95, 49 -; SI-NEXT: v_writelane_b32 v61, s31, 50 -; SI-NEXT: v_writelane_b32 v61, s24, 51 -; SI-NEXT: v_writelane_b32 v61, s38, 52 -; SI-NEXT: 
v_writelane_b32 v61, s36, 53 -; SI-NEXT: v_writelane_b32 v61, s8, 54 -; SI-NEXT: v_writelane_b32 v61, s27, 55 -; SI-NEXT: v_writelane_b32 v61, s9, 56 -; SI-NEXT: v_writelane_b32 v61, s79, 57 -; SI-NEXT: v_writelane_b32 v61, s13, 58 -; SI-NEXT: v_writelane_b32 v61, s15, 59 -; SI-NEXT: v_writelane_b32 v61, s42, 60 -; SI-NEXT: v_writelane_b32 v61, s43, 61 -; SI-NEXT: v_writelane_b32 v61, s44, 62 -; SI-NEXT: v_writelane_b32 v61, s45, 63 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s37, v31 +; SI-NEXT: v_readfirstlane_b32 s39, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s50, v31 +; SI-NEXT: v_readfirstlane_b32 s53, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:124 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s48, v31 +; SI-NEXT: v_readfirstlane_b32 s50, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s19, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 +; SI-NEXT: v_writelane_b32 v61, s4, 34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s64, v31 +; SI-NEXT: v_readfirstlane_b32 s29, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s17, v31 +; SI-NEXT: v_readfirstlane_b32 s26, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s65, v31 +; SI-NEXT: v_readfirstlane_b32 s23, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s71, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 +; SI-NEXT: v_writelane_b32 v61, s4, 35 +; SI-NEXT: v_writelane_b32 v61, s67, 36 +; SI-NEXT: 
v_writelane_b32 v61, s10, 37 +; SI-NEXT: v_writelane_b32 v61, s65, 38 +; SI-NEXT: v_writelane_b32 v61, s18, 39 +; SI-NEXT: v_writelane_b32 v61, s21, 40 +; SI-NEXT: v_writelane_b32 v61, s20, 41 +; SI-NEXT: v_writelane_b32 v61, s64, 42 +; SI-NEXT: v_writelane_b32 v61, s22, 43 +; SI-NEXT: v_writelane_b32 v61, s77, 44 +; SI-NEXT: v_writelane_b32 v61, s92, 45 +; SI-NEXT: v_writelane_b32 v61, s54, 46 +; SI-NEXT: v_writelane_b32 v61, s66, 47 +; SI-NEXT: v_writelane_b32 v61, s79, 48 +; SI-NEXT: v_writelane_b32 v61, s31, 49 +; SI-NEXT: v_writelane_b32 v61, s28, 50 +; SI-NEXT: v_writelane_b32 v61, s55, 51 +; SI-NEXT: v_writelane_b32 v61, s80, 52 +; SI-NEXT: v_writelane_b32 v61, s51, 53 +; SI-NEXT: v_writelane_b32 v61, s36, 54 +; SI-NEXT: v_writelane_b32 v61, s87, 55 +; SI-NEXT: v_writelane_b32 v61, s84, 56 +; SI-NEXT: v_writelane_b32 v61, s13, 57 +; SI-NEXT: v_writelane_b32 v61, s15, 58 +; SI-NEXT: v_writelane_b32 v61, s42, 59 +; SI-NEXT: v_writelane_b32 v61, s43, 60 +; SI-NEXT: v_writelane_b32 v61, s44, 61 +; SI-NEXT: v_writelane_b32 v61, s73, 62 +; SI-NEXT: v_writelane_b32 v61, s74, 63 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s70, v31 +; SI-NEXT: v_readfirstlane_b32 s19, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s83, v31 +; SI-NEXT: v_readfirstlane_b32 s71, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s49, v31 +; SI-NEXT: v_readfirstlane_b32 s17, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s80, v31 +; SI-NEXT: v_readfirstlane_b32 s70, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s82, v31 +; SI-NEXT: v_readfirstlane_b32 s37, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 
; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s87, v31 +; SI-NEXT: v_readfirstlane_b32 s82, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s84, v31 +; SI-NEXT: v_readfirstlane_b32 s83, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s51, v31 +; SI-NEXT: v_readfirstlane_b32 s86, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s86, v31 +; SI-NEXT: v_readfirstlane_b32 s30, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s94, v31 +; SI-NEXT: v_readfirstlane_b32 s96, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s96, v31 +; SI-NEXT: v_readfirstlane_b32 s48, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s68, v31 +; SI-NEXT: v_readfirstlane_b32 s98, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s34, v31 +; SI-NEXT: v_readfirstlane_b32 s38, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s77, v31 +; SI-NEXT: v_readfirstlane_b32 s68, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s66, v31 +; SI-NEXT: v_readfirstlane_b32 s99, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s78, v31 +; SI-NEXT: v_readfirstlane_b32 s69, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s53, v31 +; SI-NEXT: 
v_readfirstlane_b32 s49, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s69, v31 +; SI-NEXT: v_readfirstlane_b32 s6, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s30, v31 +; SI-NEXT: v_readfirstlane_b32 s90, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s52, v31 +; SI-NEXT: v_readfirstlane_b32 s34, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s75, v31 +; SI-NEXT: v_readfirstlane_b32 s52, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s23, v31 +; SI-NEXT: v_readfirstlane_b32 s88, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s28, v31 +; SI-NEXT: v_readfirstlane_b32 s8, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s26, v31 +; SI-NEXT: v_readfirstlane_b32 s24, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s25, v31 +; SI-NEXT: v_readfirstlane_b32 s76, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: v_writelane_b32 v62, s25, 8 -; SI-NEXT: v_writelane_b32 v62, s28, 9 +; SI-NEXT: v_writelane_b32 v62, s76, 7 +; SI-NEXT: v_writelane_b32 v62, s8, 8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s92, v31 -; SI-NEXT: v_writelane_b32 v62, s92, 10 -; SI-NEXT: v_writelane_b32 v62, s75, 11 -; SI-NEXT: v_writelane_b32 v62, s26, 12 -; SI-NEXT: v_writelane_b32 v62, s30, 13 -; SI-NEXT: v_writelane_b32 v62, s23, 14 -; SI-NEXT: v_writelane_b32 v62, s52, 15 -; SI-NEXT: v_writelane_b32 v62, s64, 16 -; SI-NEXT: 
v_writelane_b32 v62, s17, 17 -; SI-NEXT: v_writelane_b32 v62, s65, 18 -; SI-NEXT: v_writelane_b32 v62, s70, 19 -; SI-NEXT: v_writelane_b32 v62, s71, 20 -; SI-NEXT: v_writelane_b32 v62, s49, 21 -; SI-NEXT: v_writelane_b32 v62, s83, 22 -; SI-NEXT: v_writelane_b32 v62, s80, 23 -; SI-NEXT: v_writelane_b32 v62, s82, 24 -; SI-NEXT: v_writelane_b32 v62, s84, 25 -; SI-NEXT: v_writelane_b32 v62, s87, 26 -; SI-NEXT: v_writelane_b32 v62, s86, 27 -; SI-NEXT: v_writelane_b32 v62, s51, 28 -; SI-NEXT: v_writelane_b32 v62, s96, 29 -; SI-NEXT: v_writelane_b32 v62, s34, 30 -; SI-NEXT: v_writelane_b32 v62, s94, 31 -; SI-NEXT: v_writelane_b32 v62, s53, 32 -; SI-NEXT: v_writelane_b32 v62, s66, 33 -; SI-NEXT: v_writelane_b32 v62, s68, 34 -; SI-NEXT: v_writelane_b32 v62, s69, 35 -; SI-NEXT: v_writelane_b32 v62, s77, 36 -; SI-NEXT: v_writelane_b32 v62, s78, 37 -; SI-NEXT: s_cbranch_scc0 .LBB93_4 +; SI-NEXT: v_readfirstlane_b32 s75, v31 +; SI-NEXT: v_writelane_b32 v62, s75, 9 +; SI-NEXT: v_writelane_b32 v62, s52, 10 +; SI-NEXT: v_writelane_b32 v62, s24, 11 +; SI-NEXT: v_writelane_b32 v62, s90, 12 +; SI-NEXT: v_writelane_b32 v62, s88, 13 +; SI-NEXT: v_writelane_b32 v62, s34, 14 +; SI-NEXT: v_writelane_b32 v62, s17, 15 +; SI-NEXT: v_writelane_b32 v62, s71, 16 +; SI-NEXT: v_writelane_b32 v62, s70, 17 +; SI-NEXT: v_writelane_b32 v62, s37, 18 +; SI-NEXT: v_writelane_b32 v62, s83, 19 +; SI-NEXT: v_writelane_b32 v62, s82, 20 +; SI-NEXT: v_writelane_b32 v62, s30, 21 +; SI-NEXT: v_writelane_b32 v62, s86, 22 +; SI-NEXT: v_writelane_b32 v62, s48, 23 +; SI-NEXT: v_writelane_b32 v62, s38, 24 +; SI-NEXT: v_writelane_b32 v62, s96, 25 +; SI-NEXT: v_writelane_b32 v62, s49, 26 +; SI-NEXT: v_writelane_b32 v62, s99, 27 +; SI-NEXT: v_writelane_b32 v62, s98, 28 +; SI-NEXT: v_writelane_b32 v62, s6, 29 +; SI-NEXT: v_writelane_b32 v62, s68, 30 +; SI-NEXT: v_writelane_b32 v62, s69, 31 +; SI-NEXT: s_cbranch_scc0 .LBB93_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s10, 0xff -; SI-NEXT: s_lshl_b32 s5, 
s54, 8 +; SI-NEXT: s_lshl_b32 s5, s67, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 ; SI-NEXT: s_and_b32 s4, s18, 0xff -; SI-NEXT: s_lshl_b32 s5, s67, 8 +; SI-NEXT: s_lshl_b32 s5, s65, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: s_and_b32 s4, s60, 0xff -; SI-NEXT: s_lshl_b32 s5, s61, 8 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 ; SI-NEXT: s_and_b32 s4, s22, 0xff -; SI-NEXT: s_lshl_b32 s5, s35, 8 +; SI-NEXT: s_lshl_b32 s5, s64, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_and_b32 s4, s63, 0xff -; SI-NEXT: s_lshl_b32 s5, s62, 8 +; SI-NEXT: s_and_b32 s4, s92, 0xff +; SI-NEXT: s_lshl_b32 s5, s77, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_readlane_b32 s5, v61, 2 -; SI-NEXT: s_and_b32 s4, s39, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_and_b32 s4, s66, 0xff +; SI-NEXT: s_lshl_b32 s5, s54, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s79, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 1 -; SI-NEXT: v_readlane_b32 s5, v61, 0 -; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_and_b32 s4, s99, 0xff -; SI-NEXT: s_lshl_b32 s5, s55, 8 +; SI-NEXT: s_and_b32 s4, s31, 0xff +; SI-NEXT: s_lshl_b32 s5, s95, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: 
v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_and_b32 s4, s95, 0xff -; SI-NEXT: s_lshl_b32 s5, s93, 8 +; SI-NEXT: s_and_b32 s4, s55, 0xff +; SI-NEXT: s_lshl_b32 s5, s91, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_and_b32 s4, s31, 0xff -; SI-NEXT: s_lshl_b32 s5, s91, 8 +; SI-NEXT: s_and_b32 s4, s80, 0xff +; SI-NEXT: s_lshl_b32 s5, s35, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_and_b32 s4, s88, 0xff -; SI-NEXT: s_lshl_b32 s5, s90, 8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; SI-NEXT: s_and_b32 s4, s51, 0xff +; SI-NEXT: s_lshl_b32 s5, s60, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_and_b32 s4, s24, 0xff -; SI-NEXT: s_lshl_b32 s5, s98, 8 +; SI-NEXT: s_and_b32 s4, s94, 0xff +; SI-NEXT: s_lshl_b32 s5, s61, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 ; SI-NEXT: s_and_b32 s4, s36, 0xff -; SI-NEXT: s_lshl_b32 s5, s38, 8 +; SI-NEXT: s_lshl_b32 s5, s63, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_and_b32 s4, s27, 0xff -; SI-NEXT: s_lshl_b32 s5, s8, 8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_and_b32 s4, s9, 0xff +; SI-NEXT: s_lshl_b32 s5, s62, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_and_b32 s4, s79, 0xff -; SI-NEXT: s_lshl_b32 s5, s9, 8 +; SI-NEXT: s_and_b32 s4, s84, 0xff +; SI-NEXT: s_lshl_b32 s5, s87, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 ; SI-NEXT: s_and_b32 s4, s15, 0xff @@ -176396,11 +176442,11 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_lshl_b32 s5, s42, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_and_b32 s4, s45, 0xff +; SI-NEXT: s_and_b32 s4, s73, 0xff ; SI-NEXT: s_lshl_b32 s5, s44, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_and_b32 s4, s6, 0xff +; SI-NEXT: s_and_b32 s4, 
s81, 0xff ; SI-NEXT: s_lshl_b32 s5, s74, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 @@ -176408,244 +176454,393 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_lshl_b32 s5, s12, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_and_b32 s4, s56, 0xff +; SI-NEXT: s_and_b32 s4, s57, 0xff ; SI-NEXT: s_lshl_b32 s5, s46, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_and_b32 s4, s59, 0xff -; SI-NEXT: s_lshl_b32 s5, s57, 8 +; SI-NEXT: s_and_b32 s4, s25, 0xff +; SI-NEXT: s_lshl_b32 s5, s58, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: s_and_b32 s4, s92, 0xff -; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_and_b32 s4, s75, 0xff +; SI-NEXT: s_lshl_b32 s5, s76, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_and_b32 s4, s26, 0xff -; SI-NEXT: s_lshl_b32 s5, s28, 8 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s5, s8, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 -; SI-NEXT: s_and_b32 s4, s23, 0xff -; SI-NEXT: s_lshl_b32 s5, s75, 8 +; SI-NEXT: s_and_b32 s4, s88, 0xff +; SI-NEXT: s_lshl_b32 s5, s52, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 -; SI-NEXT: s_and_b32 s4, s52, 0xff -; SI-NEXT: s_lshl_b32 s5, s30, 8 +; SI-NEXT: s_and_b32 s4, s34, 0xff +; SI-NEXT: s_lshl_b32 s5, s90, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: s_and_b32 s4, s69, 0xff -; SI-NEXT: s_lshl_b32 s5, s53, 8 +; SI-NEXT: s_and_b32 s4, s6, 0xff +; SI-NEXT: s_lshl_b32 s5, s49, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 -; SI-NEXT: s_and_b32 s4, s78, 0xff -; SI-NEXT: s_lshl_b32 s5, s66, 8 +; SI-NEXT: s_and_b32 s4, s69, 0xff +; SI-NEXT: s_lshl_b32 s5, s99, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 -; SI-NEXT: s_and_b32 s4, s77, 0xff -; SI-NEXT: 
s_lshl_b32 s5, s34, 8 +; SI-NEXT: s_and_b32 s4, s68, 0xff +; SI-NEXT: s_lshl_b32 s5, s38, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 -; SI-NEXT: s_and_b32 s4, s68, 0xff -; SI-NEXT: s_lshl_b32 s5, s96, 8 +; SI-NEXT: s_and_b32 s4, s98, 0xff +; SI-NEXT: s_lshl_b32 s5, s48, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 -; SI-NEXT: s_and_b32 s4, s94, 0xff -; SI-NEXT: s_lshl_b32 s5, s86, 8 +; SI-NEXT: s_and_b32 s4, s96, 0xff +; SI-NEXT: s_lshl_b32 s5, s30, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 -; SI-NEXT: s_and_b32 s4, s51, 0xff -; SI-NEXT: s_lshl_b32 s5, s84, 8 +; SI-NEXT: s_and_b32 s4, s86, 0xff +; SI-NEXT: s_lshl_b32 s5, s83, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 -; SI-NEXT: s_and_b32 s4, s87, 0xff -; SI-NEXT: s_lshl_b32 s5, s82, 8 +; SI-NEXT: s_and_b32 s4, s82, 0xff +; SI-NEXT: s_lshl_b32 s5, s37, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 -; SI-NEXT: s_and_b32 s4, s80, 0xff -; SI-NEXT: s_lshl_b32 s5, s49, 8 +; SI-NEXT: s_and_b32 s4, s70, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 -; SI-NEXT: s_and_b32 s4, s83, 0xff -; SI-NEXT: s_lshl_b32 s5, s70, 8 +; SI-NEXT: s_and_b32 s4, s71, 0xff +; SI-NEXT: s_lshl_b32 s5, s19, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s13, v61, 35 ; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 -; SI-NEXT: s_and_b32 s4, s71, 0xff -; SI-NEXT: s_lshl_b32 s5, s65, 8 +; SI-NEXT: s_and_b32 s4, s13, 0xff +; SI-NEXT: s_lshl_b32 s5, s23, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v37, s4 -; SI-NEXT: s_and_b32 s4, s17, 0xff -; SI-NEXT: s_lshl_b32 s5, s64, 8 +; SI-NEXT: s_and_b32 s4, s26, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: s_mov_b32 s79, s9 ; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s9, v61, 34 ; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 -; SI-NEXT: s_and_b32 s4, s19, 0xff -; SI-NEXT: s_lshl_b32 s5, 
s48, 8 +; SI-NEXT: s_and_b32 s4, s9, 0xff +; SI-NEXT: s_lshl_b32 s5, s50, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 -; SI-NEXT: s_and_b32 s4, s50, 0xff -; SI-NEXT: s_lshl_b32 s5, s37, 8 +; SI-NEXT: s_and_b32 s4, s53, 0xff +; SI-NEXT: s_lshl_b32 s5, s39, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s8, v61, 36 ; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 -; SI-NEXT: s_and_b32 s4, s8, 0xff -; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: s_and_b32 s4, s27, 0xff +; SI-NEXT: s_lshl_b32 s5, s78, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 -; SI-NEXT: s_and_b32 s4, s76, 0xff -; SI-NEXT: s_lshl_b32 s5, s58, 8 +; SI-NEXT: s_and_b32 s4, s56, 0xff +; SI-NEXT: s_lshl_b32 s5, s59, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 -; SI-NEXT: s_and_b32 s4, s47, 0xff -; SI-NEXT: s_lshl_b32 s5, s41, 8 +; SI-NEXT: s_and_b32 s4, s41, 0xff +; SI-NEXT: s_lshl_b32 s5, s47, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s21, v61, 33 ; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 -; SI-NEXT: s_and_b32 s4, s11, 0xff -; SI-NEXT: s_lshl_b32 s5, s7, 8 +; SI-NEXT: s_and_b32 s4, s7, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 -; SI-NEXT: s_and_b32 s4, s97, 0xff -; SI-NEXT: s_lshl_b32 s5, s81, 8 +; SI-NEXT: s_and_b32 s4, s11, 0xff +; SI-NEXT: s_lshl_b32 s5, s85, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 -; SI-NEXT: s_and_b32 s4, s85, 0xff -; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_and_b32 s4, s45, 0xff +; SI-NEXT: s_lshl_b32 s5, s97, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 ; SI-NEXT: s_and_b32 s4, s40, 0xff ; SI-NEXT: s_lshl_b32 s5, s72, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s69, v61, 35 +; SI-NEXT: v_readlane_b32 s69, v61, 32 ; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 ; SI-NEXT: s_and_b32 s4, s69, 0xff -; SI-NEXT: s_lshl_b32 s5, s73, 8 +; SI-NEXT: s_lshl_b32 s5, s93, 8 ; 
SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s68, v61, 34 +; SI-NEXT: v_readlane_b32 s68, v61, 31 ; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 ; SI-NEXT: s_and_b32 s4, s68, 0xff ; SI-NEXT: s_lshl_b32 s5, s89, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s66, v61, 33 +; SI-NEXT: v_readlane_b32 s66, v61, 30 +; SI-NEXT: v_readlane_b32 s20, v61, 29 ; SI-NEXT: v_cvt_f32_f16_e32 v41, s4 ; SI-NEXT: s_and_b32 s4, s66, 0xff -; SI-NEXT: s_lshl_b32 s5, s16, 8 +; SI-NEXT: s_lshl_b32 s5, s20, 8 +; SI-NEXT: s_mov_b32 s17, s19 +; SI-NEXT: s_mov_b32 s19, s23 +; SI-NEXT: s_mov_b32 s23, s26 +; SI-NEXT: s_mov_b32 s26, s29 +; SI-NEXT: s_mov_b32 s29, s53 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s53, v61, 32 -; SI-NEXT: v_readlane_b32 s94, v61, 31 +; SI-NEXT: v_readlane_b32 s53, v61, 28 ; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 ; SI-NEXT: s_and_b32 s4, s53, 0xff -; SI-NEXT: s_lshl_b32 s5, s94, 8 +; SI-NEXT: s_lshl_b32 s5, s16, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s34, v61, 30 -; SI-NEXT: v_readlane_b32 s96, v61, 29 +; SI-NEXT: v_readlane_b32 s34, v61, 27 +; SI-NEXT: v_readlane_b32 s6, v61, 26 ; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 ; SI-NEXT: s_and_b32 s4, s34, 0xff -; SI-NEXT: s_lshl_b32 s5, s96, 8 +; SI-NEXT: s_lshl_b32 s5, s6, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s51, v61, 28 -; SI-NEXT: v_readlane_b32 s86, v61, 27 +; SI-NEXT: v_readlane_b32 s98, v61, 25 +; SI-NEXT: v_readlane_b32 s99, v61, 24 ; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 -; SI-NEXT: s_and_b32 s4, s51, 0xff -; SI-NEXT: s_lshl_b32 s5, s86, 8 +; SI-NEXT: s_and_b32 s4, s98, 0xff +; SI-NEXT: s_lshl_b32 s5, s99, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s87, v61, 26 -; SI-NEXT: v_readlane_b32 s84, v61, 25 +; SI-NEXT: v_readlane_b32 s49, v61, 23 +; SI-NEXT: v_readlane_b32 s96, v61, 22 ; SI-NEXT: v_cvt_f32_f16_e32 v45, s4 -; SI-NEXT: s_and_b32 s4, s87, 0xff -; SI-NEXT: s_lshl_b32 s5, s84, 8 +; SI-NEXT: s_and_b32 s4, s49, 0xff +; SI-NEXT: 
s_lshl_b32 s5, s96, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s82, v61, 24 -; SI-NEXT: v_readlane_b32 s80, v61, 23 +; SI-NEXT: v_readlane_b32 s38, v61, 21 +; SI-NEXT: v_readlane_b32 s48, v61, 20 ; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 -; SI-NEXT: s_and_b32 s4, s82, 0xff -; SI-NEXT: s_lshl_b32 s5, s80, 8 +; SI-NEXT: s_and_b32 s4, s38, 0xff +; SI-NEXT: s_lshl_b32 s5, s48, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s83, v61, 22 -; SI-NEXT: v_readlane_b32 s49, v61, 21 +; SI-NEXT: v_readlane_b32 s86, v61, 19 +; SI-NEXT: v_readlane_b32 s30, v61, 18 ; SI-NEXT: v_cvt_f32_f16_e32 v47, s4 -; SI-NEXT: s_and_b32 s4, s83, 0xff -; SI-NEXT: s_lshl_b32 s5, s49, 8 +; SI-NEXT: s_and_b32 s4, s86, 0xff +; SI-NEXT: s_lshl_b32 s5, s30, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s71, v61, 20 -; SI-NEXT: v_readlane_b32 s70, v61, 19 +; SI-NEXT: v_readlane_b32 s82, v61, 17 +; SI-NEXT: v_readlane_b32 s83, v61, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v46, s4 -; SI-NEXT: s_and_b32 s4, s71, 0xff -; SI-NEXT: s_lshl_b32 s5, s70, 8 +; SI-NEXT: s_and_b32 s4, s82, 0xff +; SI-NEXT: s_lshl_b32 s5, s83, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s65, v61, 18 -; SI-NEXT: v_readlane_b32 s54, v61, 17 +; SI-NEXT: v_readlane_b32 s37, v61, 15 +; SI-NEXT: v_readlane_b32 s70, v61, 14 ; SI-NEXT: v_cvt_f32_f16_e32 v57, s4 -; SI-NEXT: s_and_b32 s4, s65, 0xff -; SI-NEXT: s_lshl_b32 s5, s54, 8 -; SI-NEXT: s_mov_b32 s17, s19 -; SI-NEXT: s_mov_b32 s19, s50 +; SI-NEXT: s_and_b32 s4, s37, 0xff +; SI-NEXT: s_lshl_b32 s5, s70, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s67, v61, 16 -; SI-NEXT: v_readlane_b32 s50, v61, 15 +; SI-NEXT: v_readlane_b32 s71, v61, 13 +; SI-NEXT: v_readlane_b32 s67, v61, 12 ; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 -; SI-NEXT: s_and_b32 s4, s67, 0xff -; SI-NEXT: s_lshl_b32 s5, s50, 8 +; SI-NEXT: s_and_b32 s4, s71, 0xff +; SI-NEXT: s_lshl_b32 s5, s67, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s64, v61, 14 -; 
SI-NEXT: v_readlane_b32 s52, v61, 13 +; SI-NEXT: v_readlane_b32 s65, v61, 11 +; SI-NEXT: v_readlane_b32 s64, v61, 10 ; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 -; SI-NEXT: s_and_b32 s4, s64, 0xff -; SI-NEXT: s_lshl_b32 s5, s52, 8 -; SI-NEXT: s_mov_b32 s23, s48 +; SI-NEXT: s_and_b32 s4, s65, 0xff +; SI-NEXT: s_lshl_b32 s5, s64, 8 +; SI-NEXT: s_mov_b32 s88, s50 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s35, v61, 12 -; SI-NEXT: v_readlane_b32 s48, v61, 11 +; SI-NEXT: v_readlane_b32 s54, v61, 9 +; SI-NEXT: v_readlane_b32 s50, v61, 8 ; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 -; SI-NEXT: s_and_b32 s4, s35, 0xff -; SI-NEXT: s_lshl_b32 s5, s48, 8 +; SI-NEXT: s_and_b32 s4, s54, 0xff +; SI-NEXT: s_lshl_b32 s5, s50, 8 +; SI-NEXT: s_mov_b32 s24, s39 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s30, v61, 10 -; SI-NEXT: v_readlane_b32 s39, v61, 9 +; SI-NEXT: v_readlane_b32 s90, v61, 7 +; SI-NEXT: v_readlane_b32 s39, v61, 6 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_and_b32 s4, s30, 0xff +; SI-NEXT: s_and_b32 s4, s90, 0xff ; SI-NEXT: s_lshl_b32 s5, s39, 8 -; SI-NEXT: s_mov_b32 s26, s37 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s37, v61, 8 -; SI-NEXT: v_readlane_b32 s75, v61, 7 +; SI-NEXT: v_readlane_b32 s52, v61, 5 +; SI-NEXT: v_readlane_b32 s75, v61, 4 ; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 -; SI-NEXT: s_and_b32 s4, s37, 0xff +; SI-NEXT: s_and_b32 s4, s52, 0xff ; SI-NEXT: s_lshl_b32 s5, s75, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s92, v61, 6 -; SI-NEXT: v_readlane_b32 s77, v61, 5 +; SI-NEXT: v_readlane_b32 s92, v61, 3 +; SI-NEXT: v_readlane_b32 s77, v61, 2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 ; SI-NEXT: s_and_b32 s4, s92, 0xff ; SI-NEXT: s_lshl_b32 s5, s77, 8 -; SI-NEXT: s_mov_b32 s28, s29 -; SI-NEXT: s_mov_b32 s29, s76 +; SI-NEXT: s_mov_b32 s8, s27 +; SI-NEXT: s_mov_b32 s27, s78 ; SI-NEXT: s_or_b32 s4, s4, 
s5 -; SI-NEXT: v_readlane_b32 s78, v61, 4 -; SI-NEXT: v_readlane_b32 s76, v61, 3 +; SI-NEXT: v_readlane_b32 s78, v61, 1 +; SI-NEXT: v_readlane_b32 s76, v61, 0 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: s_and_b32 s4, s78, 0xff ; SI-NEXT: s_lshl_b32 s5, s76, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_mov_b32 s99, s55 -; SI-NEXT: s_mov_b32 s20, s88 -; SI-NEXT: s_mov_b32 s24, s98 -; SI-NEXT: s_mov_b32 s59, s58 -; SI-NEXT: s_mov_b32 s56, s47 -; SI-NEXT: s_mov_b32 s46, s41 -; SI-NEXT: s_mov_b32 s12, s11 -; SI-NEXT: s_mov_b32 s11, s7 -; SI-NEXT: s_mov_b32 s7, s97 -; SI-NEXT: s_mov_b32 s97, s81 -; SI-NEXT: s_mov_b32 s81, s85 -; SI-NEXT: s_mov_b32 s6, s40 +; SI-NEXT: s_mov_b32 s31, s95 +; SI-NEXT: s_mov_b32 s57, s56 +; SI-NEXT: s_mov_b32 s25, s59 +; SI-NEXT: s_mov_b32 s14, s41 +; SI-NEXT: s_mov_b32 s46, s47 +; SI-NEXT: s_mov_b32 s12, s7 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s11, s85 +; SI-NEXT: s_mov_b32 s81, s45 +; SI-NEXT: s_mov_b32 s85, s97 +; SI-NEXT: s_mov_b32 s45, s40 ; SI-NEXT: s_mov_b32 s40, s72 -; SI-NEXT: s_mov_b32 s45, s73 +; SI-NEXT: s_mov_b32 s44, s93 ; SI-NEXT: s_mov_b32 s15, s89 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_mov_b32 s55, s93 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_mov_b32 s95, s91 +; SI-NEXT: s_mov_b32 s55, s35 +; SI-NEXT: s_mov_b32 s80, s60 +; SI-NEXT: s_mov_b32 s91, s61 +; SI-NEXT: s_mov_b32 s51, s63 +; SI-NEXT: s_mov_b32 s36, s62 +; SI-NEXT: s_branch .LBB93_3 +; SI-NEXT: .LBB93_2: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: s_mov_b32 s17, s19 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: s_mov_b32 s19, s23 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_mov_b32 s23, s26 +; SI-NEXT: s_mov_b32 s26, s29 +; SI-NEXT: s_mov_b32 s29, s53 +; SI-NEXT: s_mov_b32 s88, s50 +; SI-NEXT: s_mov_b32 s24, s39 +; SI-NEXT: s_mov_b32 s8, s27 +; SI-NEXT: s_mov_b32 s27, s78 +; SI-NEXT: s_mov_b32 s25, s59 +; SI-NEXT: s_mov_b32 s57, s56 +; SI-NEXT: s_mov_b32 
s46, s47 +; SI-NEXT: s_mov_b32 s14, s41 +; SI-NEXT: s_mov_b32 s12, s7 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s11, s85 +; SI-NEXT: s_mov_b32 s85, s97 +; SI-NEXT: s_mov_b32 s81, s45 +; SI-NEXT: s_mov_b32 s45, s40 +; SI-NEXT: s_mov_b32 s40, s72 +; SI-NEXT: s_mov_b32 s44, s93 +; SI-NEXT: s_mov_b32 s15, s89 +; SI-NEXT: s_mov_b32 s79, s9 +; SI-NEXT: s_mov_b32 s31, s95 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_readlane_b32 s75, v61, 4 +; SI-NEXT: v_readlane_b32 s76, v61, 0 +; SI-NEXT: v_readlane_b32 s77, v61, 2 +; SI-NEXT: v_readlane_b32 s78, v61, 1 +; SI-NEXT: v_readlane_b32 s92, v61, 3 +; SI-NEXT: v_readlane_b32 s39, v61, 6 +; SI-NEXT: v_readlane_b32 s52, v61, 5 +; SI-NEXT: v_readlane_b32 s90, v61, 7 +; SI-NEXT: v_readlane_b32 s50, v61, 8 +; SI-NEXT: v_readlane_b32 s64, v61, 10 +; SI-NEXT: v_readlane_b32 s54, v61, 9 +; SI-NEXT: v_readlane_b32 s67, v61, 12 +; SI-NEXT: v_readlane_b32 s65, v61, 11 +; SI-NEXT: v_readlane_b32 s70, v61, 14 +; SI-NEXT: v_readlane_b32 s71, v61, 13 +; SI-NEXT: v_readlane_b32 s37, v61, 15 +; SI-NEXT: v_readlane_b32 s83, v61, 16 +; SI-NEXT: v_readlane_b32 s30, v61, 18 +; SI-NEXT: v_readlane_b32 s82, v61, 17 +; SI-NEXT: v_readlane_b32 s48, v61, 20 +; SI-NEXT: v_readlane_b32 s86, v61, 19 +; SI-NEXT: v_readlane_b32 s96, v61, 22 +; SI-NEXT: v_readlane_b32 s38, v61, 21 +; SI-NEXT: v_readlane_b32 s49, v61, 23 +; SI-NEXT: v_readlane_b32 s99, v61, 24 +; SI-NEXT: v_readlane_b32 s6, v61, 26 +; SI-NEXT: v_readlane_b32 s98, v61, 25 ; SI-NEXT: s_mov_b32 s95, s91 -; SI-NEXT: s_mov_b32 s31, s90 -; SI-NEXT: s_cbranch_execnz .LBB93_3 -; SI-NEXT: .LBB93_2: ; %cmp.true +; SI-NEXT: s_mov_b32 s55, s35 +; SI-NEXT: s_mov_b32 s80, s60 +; SI-NEXT: v_readlane_b32 s20, v61, 29 +; SI-NEXT: s_mov_b32 s91, s61 +; SI-NEXT: s_mov_b32 s51, s63 +; SI-NEXT: s_mov_b32 s36, s62 +; SI-NEXT: v_readlane_b32 s34, v61, 27 +; SI-NEXT: v_readlane_b32 s53, v61, 28 +; SI-NEXT: v_readlane_b32 s66, v61, 30 +; 
SI-NEXT: v_readlane_b32 s68, v61, 31 +; SI-NEXT: v_readlane_b32 s69, v61, 32 +; SI-NEXT: v_readlane_b32 s21, v61, 33 +; SI-NEXT: v_readlane_b32 s9, v61, 34 +; SI-NEXT: v_readlane_b32 s13, v61, 35 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; 
SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: .LBB93_3: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_mov_b32 s35, s79 +; SI-NEXT: s_cbranch_vccnz .LBB93_5 +; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: s_add_i32 s4, s78, 3 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_lshl_b32 s5, s76, 8 @@ -176654,60 +176849,60 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_and_b32 s5, s5, 0xff ; SI-NEXT: s_lshl_b32 vcc_lo, s77, 8 ; SI-NEXT: s_or_b32 s5, vcc_lo, s5 -; SI-NEXT: s_add_i32 vcc_lo, s37, 3 +; SI-NEXT: s_add_i32 vcc_lo, s52, 3 ; SI-NEXT: s_and_b32 vcc_lo, vcc_lo, 0xff ; SI-NEXT: s_lshl_b32 vcc_hi, s75, 8 ; SI-NEXT: s_or_b32 vcc_lo, vcc_hi, vcc_lo -; SI-NEXT: s_add_i32 vcc_hi, s30, 3 +; SI-NEXT: s_add_i32 vcc_hi, s90, 3 ; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff ; SI-NEXT: s_lshl_b32 s60, s39, 8 ; SI-NEXT: s_or_b32 s60, s60, vcc_hi -; SI-NEXT: s_add_i32 vcc_hi, s35, 3 +; SI-NEXT: s_add_i32 vcc_hi, s54, 3 ; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s61, s48, 8 +; SI-NEXT: s_lshl_b32 s61, s50, 8 ; SI-NEXT: s_or_b32 s61, s61, vcc_hi -; SI-NEXT: s_add_i32 vcc_hi, s64, 3 +; SI-NEXT: s_add_i32 vcc_hi, s65, 3 ; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s62, s52, 8 +; SI-NEXT: s_lshl_b32 s62, s64, 8 ; SI-NEXT: s_or_b32 s62, s62, vcc_hi -; SI-NEXT: s_add_i32 vcc_hi, s67, 3 +; SI-NEXT: s_add_i32 vcc_hi, s71, 3 ; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s63, s50, 8 +; SI-NEXT: s_lshl_b32 s63, s67, 8 ; SI-NEXT: s_or_b32 s10, s63, vcc_hi -; SI-NEXT: s_add_i32 vcc_hi, s65, 3 +; SI-NEXT: s_add_i32 vcc_hi, s37, 
3 ; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s72, s54, 8 +; SI-NEXT: s_lshl_b32 s72, s70, 8 ; SI-NEXT: s_or_b32 s72, s72, vcc_hi -; SI-NEXT: s_add_i32 vcc_hi, s71, 3 +; SI-NEXT: s_add_i32 vcc_hi, s82, 3 ; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s73, s70, 8 +; SI-NEXT: s_lshl_b32 s73, s83, 8 ; SI-NEXT: s_or_b32 s73, s73, vcc_hi -; SI-NEXT: s_add_i32 vcc_hi, s83, 3 +; SI-NEXT: s_add_i32 vcc_hi, s86, 3 ; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s74, s49, 8 +; SI-NEXT: s_lshl_b32 s74, s30, 8 ; SI-NEXT: s_or_b32 s74, s74, vcc_hi -; SI-NEXT: s_add_i32 vcc_hi, s82, 3 +; SI-NEXT: s_add_i32 vcc_hi, s38, 3 ; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s75, s80, 8 +; SI-NEXT: s_lshl_b32 s75, s48, 8 ; SI-NEXT: s_or_b32 s75, s75, vcc_hi -; SI-NEXT: s_add_i32 vcc_hi, s87, 3 +; SI-NEXT: s_add_i32 vcc_hi, s49, 3 ; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s76, s84, 8 +; SI-NEXT: s_lshl_b32 s76, s96, 8 ; SI-NEXT: s_or_b32 s76, s76, vcc_hi -; SI-NEXT: s_add_i32 vcc_hi, s51, 3 +; SI-NEXT: s_add_i32 vcc_hi, s98, 3 ; SI-NEXT: s_add_i32 s93, s53, 3 ; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s77, s86, 8 +; SI-NEXT: s_lshl_b32 s77, s99, 8 ; SI-NEXT: s_add_i32 s89, s34, 3 ; SI-NEXT: s_and_b32 s93, s93, 0xff -; SI-NEXT: s_lshl_b32 s78, s94, 8 +; SI-NEXT: s_lshl_b32 s78, s16, 8 ; SI-NEXT: s_add_i32 s34, s66, 3 ; SI-NEXT: s_or_b32 s77, s77, vcc_hi ; SI-NEXT: s_and_b32 s89, s89, 0xff -; SI-NEXT: s_lshl_b32 vcc_hi, s96, 8 +; SI-NEXT: s_lshl_b32 vcc_hi, s6, 8 ; SI-NEXT: s_or_b32 s22, s78, s93 ; SI-NEXT: s_and_b32 s93, s34, 0xff -; SI-NEXT: s_lshl_b32 s92, s16, 8 +; SI-NEXT: s_lshl_b32 s92, s20, 8 ; SI-NEXT: s_add_i32 s53, s68, 3 ; SI-NEXT: s_or_b32 s89, vcc_hi, s89 ; SI-NEXT: s_or_b32 s92, s92, s93 @@ -176716,261 +176911,251 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_add_i32 s66, s69, 3 ; SI-NEXT: s_or_b32 s93, 
vcc_hi, s93 ; SI-NEXT: s_and_b32 vcc_hi, s66, 0xff -; SI-NEXT: s_lshl_b32 s34, s45, 8 -; SI-NEXT: s_add_i32 s68, s6, 3 +; SI-NEXT: s_lshl_b32 s34, s44, 8 +; SI-NEXT: s_add_i32 s68, s45, 3 ; SI-NEXT: s_or_b32 vcc_hi, s34, vcc_hi ; SI-NEXT: s_and_b32 s34, s68, 0xff ; SI-NEXT: s_lshl_b32 s39, s40, 8 ; SI-NEXT: s_add_i32 s69, s81, 3 ; SI-NEXT: s_or_b32 s34, s39, s34 ; SI-NEXT: s_and_b32 s39, s69, 0xff -; SI-NEXT: s_lshl_b32 s52, s21, 8 +; SI-NEXT: s_lshl_b32 s52, s85, 8 ; SI-NEXT: s_add_i32 s81, s7, 3 ; SI-NEXT: s_or_b32 s39, s52, s39 ; SI-NEXT: s_and_b32 s52, s81, 0xff -; SI-NEXT: s_lshl_b32 s53, s97, 8 +; SI-NEXT: s_lshl_b32 s53, s11, 8 ; SI-NEXT: s_add_i32 s85, s12, 3 ; SI-NEXT: s_or_b32 s52, s53, s52 ; SI-NEXT: s_and_b32 s53, s85, 0xff -; SI-NEXT: s_lshl_b32 s64, s11, 8 -; SI-NEXT: s_add_i32 s97, s56, 3 +; SI-NEXT: s_lshl_b32 s64, s21, 8 +; SI-NEXT: s_add_i32 s97, s14, 3 ; SI-NEXT: s_or_b32 s53, s64, s53 ; SI-NEXT: s_and_b32 s64, s97, 0xff ; SI-NEXT: s_lshl_b32 s66, s46, 8 -; SI-NEXT: s_add_i32 s21, s29, 3 +; SI-NEXT: s_add_i32 s21, s57, 3 ; SI-NEXT: s_or_b32 s64, s66, s64 ; SI-NEXT: s_and_b32 s21, s21, 0xff -; SI-NEXT: s_lshl_b32 s66, s59, 8 +; SI-NEXT: s_lshl_b32 s66, s25, 8 ; SI-NEXT: s_add_i32 s25, s8, 3 ; SI-NEXT: s_or_b32 s66, s66, s21 ; SI-NEXT: s_and_b32 s21, s25, 0xff -; SI-NEXT: s_lshl_b32 s6, s28, 8 -; SI-NEXT: s_add_i32 s29, s19, 3 +; SI-NEXT: s_lshl_b32 s6, s27, 8 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: v_readlane_b32 s16, v62, 14 ; SI-NEXT: s_or_b32 s67, s6, s21 ; SI-NEXT: s_and_b32 s6, s29, 0xff -; SI-NEXT: s_lshl_b32 s18, s26, 8 -; SI-NEXT: s_add_i32 s28, s17, 3 +; SI-NEXT: s_lshl_b32 s18, s24, 8 +; SI-NEXT: s_add_i32 s28, s9, 3 +; SI-NEXT: s_add_i32 s27, s16, 3 +; SI-NEXT: v_readlane_b32 s16, v62, 12 ; SI-NEXT: s_or_b32 s68, s18, s6 ; SI-NEXT: s_and_b32 s6, s28, 0xff -; SI-NEXT: s_lshl_b32 s18, s23, 8 +; SI-NEXT: s_lshl_b32 s18, s88, 8 +; SI-NEXT: s_add_i32 s7, s23, 3 +; SI-NEXT: s_lshl_b32 s23, s16, 8 +; SI-NEXT: v_readlane_b32 s16, v62, 13 
; SI-NEXT: s_or_b32 s69, s18, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 17 -; SI-NEXT: s_add_i32 s7, s6, 3 -; SI-NEXT: v_readlane_b32 s16, v62, 15 ; SI-NEXT: s_and_b32 s6, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v62, 16 -; SI-NEXT: s_add_i32 s27, s16, 3 -; SI-NEXT: v_readlane_b32 s16, v62, 13 -; SI-NEXT: s_lshl_b32 s7, s7, 8 -; SI-NEXT: s_lshl_b32 s23, s16, 8 -; SI-NEXT: v_readlane_b32 s16, v62, 14 -; SI-NEXT: s_mov_b32 s91, s24 -; SI-NEXT: s_or_b32 s70, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 20 +; SI-NEXT: s_lshl_b32 s7, s26, 8 +; SI-NEXT: s_add_i32 s11, s13, 3 ; SI-NEXT: s_add_i32 s24, s16, 3 -; SI-NEXT: v_readlane_b32 s16, v62, 11 -; SI-NEXT: s_add_i32 s11, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 18 -; SI-NEXT: s_lshl_b32 s19, s16, 8 -; SI-NEXT: v_readlane_b32 s16, v62, 12 -; SI-NEXT: s_mov_b32 s90, s20 +; SI-NEXT: v_readlane_b32 s16, v62, 10 +; SI-NEXT: s_or_b32 s70, s7, s6 ; SI-NEXT: s_and_b32 s6, s11, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 8 -; SI-NEXT: s_add_i32 s20, s16, 3 -; SI-NEXT: v_readlane_b32 s16, v62, 9 +; SI-NEXT: s_lshl_b32 s7, s19, 8 +; SI-NEXT: s_lshl_b32 s19, s16, 8 +; SI-NEXT: v_readlane_b32 s16, v62, 11 ; SI-NEXT: s_or_b32 s71, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 22 +; SI-NEXT: v_readlane_b32 s6, v62, 16 +; SI-NEXT: s_add_i32 s20, s16, 3 +; SI-NEXT: v_readlane_b32 s16, v62, 8 +; SI-NEXT: s_add_i32 s12, s6, 3 +; SI-NEXT: s_lshl_b32 s7, s17, 8 ; SI-NEXT: s_and_b32 s20, s20, 0xff ; SI-NEXT: s_lshl_b32 s17, s16, 8 -; SI-NEXT: v_readlane_b32 s16, v62, 10 -; SI-NEXT: s_add_i32 s12, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 19 +; SI-NEXT: v_readlane_b32 s16, v62, 9 +; SI-NEXT: s_and_b32 s6, s12, 0xff ; SI-NEXT: s_or_b32 s17, s17, s20 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s20, v62, 8 -; SI-NEXT: s_and_b32 s6, s12, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: v_readlane_b32 s20, v62, 7 +; SI-NEXT: s_or_b32 s81, s7, s6 +; SI-NEXT: v_readlane_b32 s6, v62, 17 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s20, 
s20, 8 -; SI-NEXT: s_or_b32 s81, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 23 +; SI-NEXT: s_add_i32 s14, s6, 3 +; SI-NEXT: v_readlane_b32 s7, v62, 15 ; SI-NEXT: s_and_b32 s24, s24, 0xff ; SI-NEXT: s_or_b32 s16, s20, s16 -; SI-NEXT: v_readlane_b32 s20, v62, 7 -; SI-NEXT: s_add_i32 s14, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 21 -; SI-NEXT: s_or_b32 s19, s19, s24 -; SI-NEXT: s_add_i32 s98, s20, 3 -; SI-NEXT: v_readlane_b32 s24, v62, 6 +; SI-NEXT: v_readlane_b32 s20, v62, 6 ; SI-NEXT: s_and_b32 s6, s14, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_or_b32 s19, s19, s24 +; SI-NEXT: s_add_i32 s98, s20, 3 +; SI-NEXT: v_readlane_b32 s24, v62, 5 +; SI-NEXT: s_or_b32 s83, s7, s6 +; SI-NEXT: v_readlane_b32 s6, v62, 20 ; SI-NEXT: s_and_b32 s20, s98, 0xff ; SI-NEXT: s_lshl_b32 s24, s24, 8 -; SI-NEXT: s_or_b32 s83, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 26 +; SI-NEXT: s_add_i32 s41, s6, 3 +; SI-NEXT: v_readlane_b32 s7, v62, 18 ; SI-NEXT: s_and_b32 s27, s27, 0xff ; SI-NEXT: s_or_b32 s20, s24, s20 -; SI-NEXT: v_readlane_b32 s24, v62, 5 -; SI-NEXT: s_add_i32 s41, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 24 -; SI-NEXT: s_or_b32 s23, s23, s27 -; SI-NEXT: s_add_i32 s86, s24, 3 -; SI-NEXT: v_readlane_b32 s27, v62, 4 +; SI-NEXT: v_readlane_b32 s24, v62, 4 ; SI-NEXT: s_and_b32 s6, s41, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_or_b32 s23, s23, s27 +; SI-NEXT: s_add_i32 s86, s24, 3 +; SI-NEXT: v_readlane_b32 s27, v62, 3 +; SI-NEXT: s_or_b32 s85, s7, s6 +; SI-NEXT: v_readlane_b32 s6, v62, 22 ; SI-NEXT: s_and_b32 s24, s86, 0xff ; SI-NEXT: s_lshl_b32 s27, s27, 8 -; SI-NEXT: s_or_b32 s85, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 28 -; SI-NEXT: s_or_b32 s24, s27, s24 -; SI-NEXT: v_readlane_b32 s27, v62, 3 ; SI-NEXT: s_add_i32 s46, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 25 -; SI-NEXT: s_add_i32 s12, s73, 0x300 -; SI-NEXT: s_add_i32 s82, s27, 3 -; SI-NEXT: v_readlane_b32 s73, v62, 2 +; SI-NEXT: v_readlane_b32 s7, v62, 19 +; SI-NEXT: s_or_b32 s24, s27, s24 +; SI-NEXT: 
v_readlane_b32 s27, v62, 2 ; SI-NEXT: s_and_b32 s6, s46, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_add_i32 s12, s73, 0x300 +; SI-NEXT: s_add_i32 s82, s27, 3 +; SI-NEXT: v_readlane_b32 s73, v62, 1 +; SI-NEXT: s_or_b32 s96, s7, s6 +; SI-NEXT: v_readlane_b32 s6, v62, 25 ; SI-NEXT: s_and_b32 s27, s82, 0xff ; SI-NEXT: s_lshl_b32 s73, s73, 8 -; SI-NEXT: s_or_b32 s96, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 31 -; SI-NEXT: s_or_b32 s27, s73, s27 -; SI-NEXT: v_readlane_b32 s73, v62, 1 ; SI-NEXT: s_add_i32 s47, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 27 -; SI-NEXT: s_add_i32 s13, s74, 0x300 -; SI-NEXT: s_add_i32 s65, s73, 3 -; SI-NEXT: v_readlane_b32 s74, v62, 0 +; SI-NEXT: v_readlane_b32 s7, v62, 21 +; SI-NEXT: s_or_b32 s27, s73, s27 +; SI-NEXT: v_readlane_b32 s73, v62, 0 ; SI-NEXT: s_and_b32 s6, s47, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_add_i32 s13, s74, 0x300 +; SI-NEXT: s_add_i32 s65, s73, 3 +; SI-NEXT: v_readlane_b32 s74, v61, 63 +; SI-NEXT: s_or_b32 s97, s7, s6 +; SI-NEXT: v_readlane_b32 s6, v62, 28 ; SI-NEXT: s_and_b32 s73, s65, 0xff ; SI-NEXT: s_lshl_b32 s74, s74, 8 -; SI-NEXT: s_or_b32 s97, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 34 -; SI-NEXT: s_or_b32 s73, s74, s73 -; SI-NEXT: v_readlane_b32 s74, v61, 63 ; SI-NEXT: s_add_i32 s56, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 29 -; SI-NEXT: s_add_i32 s14, s75, 0x300 -; SI-NEXT: s_add_i32 s54, s74, 3 -; SI-NEXT: v_readlane_b32 s75, v61, 62 +; SI-NEXT: v_readlane_b32 s7, v62, 23 +; SI-NEXT: s_or_b32 s73, s74, s73 +; SI-NEXT: v_readlane_b32 s74, v61, 62 ; SI-NEXT: s_and_b32 s6, s56, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_add_i32 s14, s75, 0x300 +; SI-NEXT: s_add_i32 s54, s74, 3 +; SI-NEXT: v_readlane_b32 s75, v61, 61 +; SI-NEXT: s_or_b32 s63, s7, s6 +; SI-NEXT: v_readlane_b32 s6, v62, 30 ; SI-NEXT: s_and_b32 s74, s54, 0xff ; SI-NEXT: s_lshl_b32 s75, s75, 8 -; SI-NEXT: s_or_b32 s63, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 36 -; SI-NEXT: s_or_b32 s74, s75, s74 -; SI-NEXT: 
v_readlane_b32 s75, v61, 61 ; SI-NEXT: s_add_i32 s58, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 30 -; SI-NEXT: s_add_i32 s15, s76, 0x300 -; SI-NEXT: s_add_i32 s50, s75, 3 -; SI-NEXT: v_readlane_b32 s76, v61, 60 +; SI-NEXT: v_readlane_b32 s7, v62, 24 +; SI-NEXT: s_or_b32 s74, s75, s74 +; SI-NEXT: v_readlane_b32 s75, v61, 60 ; SI-NEXT: s_and_b32 s6, s58, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_add_i32 s15, s76, 0x300 +; SI-NEXT: s_add_i32 s50, s75, 3 +; SI-NEXT: v_readlane_b32 s76, v61, 59 +; SI-NEXT: s_or_b32 s79, s7, s6 +; SI-NEXT: v_readlane_b32 s6, v62, 31 ; SI-NEXT: s_and_b32 s75, s50, 0xff ; SI-NEXT: s_lshl_b32 s76, s76, 8 -; SI-NEXT: s_or_b32 s79, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 37 -; SI-NEXT: s_or_b32 s75, s76, s75 -; SI-NEXT: v_readlane_b32 s76, v61, 59 ; SI-NEXT: s_add_i32 s59, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 33 -; SI-NEXT: s_add_i32 s18, s77, 0x300 -; SI-NEXT: s_add_i32 s48, s76, 3 -; SI-NEXT: v_readlane_b32 s77, v61, 58 +; SI-NEXT: v_readlane_b32 s7, v62, 27 +; SI-NEXT: s_or_b32 s75, s76, s75 +; SI-NEXT: v_readlane_b32 s76, v61, 58 ; SI-NEXT: s_and_b32 s6, s59, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_add_i32 s18, s77, 0x300 +; SI-NEXT: s_add_i32 s48, s76, 3 +; SI-NEXT: v_readlane_b32 s77, v61, 57 +; SI-NEXT: s_or_b32 s78, s7, s6 +; SI-NEXT: v_readlane_b32 s6, v62, 29 ; SI-NEXT: s_and_b32 s76, s48, 0xff ; SI-NEXT: s_lshl_b32 s77, s77, 8 -; SI-NEXT: s_or_b32 s78, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 35 -; SI-NEXT: s_or_b32 s76, s77, s76 -; SI-NEXT: v_readlane_b32 s77, v61, 57 ; SI-NEXT: s_add_i32 s57, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 32 +; SI-NEXT: v_readlane_b32 s7, v62, 26 +; SI-NEXT: s_or_b32 s76, s77, s76 +; SI-NEXT: v_readlane_b32 s77, v61, 56 +; SI-NEXT: s_and_b32 s6, s57, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_add_i32 s11, s72, 0x300 ; SI-NEXT: s_add_i32 s72, s79, 0x300 ; SI-NEXT: s_add_i32 s37, s77, 3 -; SI-NEXT: v_readlane_b32 s79, v61, 56 -; SI-NEXT: s_and_b32 s6, s57, 0xff -; 
SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: v_readlane_b32 s79, v61, 55 +; SI-NEXT: s_or_b32 s88, s7, s6 ; SI-NEXT: s_and_b32 s77, s37, 0xff ; SI-NEXT: s_lshl_b32 s79, s79, 8 -; SI-NEXT: s_or_b32 s88, s7, s6 -; SI-NEXT: s_or_b32 s77, s79, s77 -; SI-NEXT: v_readlane_b32 s79, v61, 55 +; SI-NEXT: s_add_i32 s35, s35, 3 ; SI-NEXT: s_add_i32 s21, s89, 0x300 ; SI-NEXT: s_add_i32 s89, s88, 0x300 -; SI-NEXT: s_add_i32 s35, s79, 3 -; SI-NEXT: v_readlane_b32 s88, v61, 54 +; SI-NEXT: s_or_b32 s77, s79, s77 ; SI-NEXT: s_and_b32 s79, s35, 0xff -; SI-NEXT: s_lshl_b32 s88, s88, 8 +; SI-NEXT: s_lshl_b32 s88, s36, 8 ; SI-NEXT: s_or_b32 s79, s88, s79 -; SI-NEXT: v_readlane_b32 s88, v61, 53 -; SI-NEXT: s_add_i32 s25, s92, 0x300 +; SI-NEXT: v_readlane_b32 s88, v61, 54 ; SI-NEXT: s_add_i32 s30, s88, 3 -; SI-NEXT: v_readlane_b32 s92, v61, 52 +; SI-NEXT: s_add_i32 s25, s92, 0x300 ; SI-NEXT: s_and_b32 s88, s30, 0xff -; SI-NEXT: s_lshl_b32 s92, s92, 8 +; SI-NEXT: s_lshl_b32 s92, s51, 8 +; SI-NEXT: s_add_i32 s94, s94, 3 +; SI-NEXT: v_readlane_b32 s90, v61, 53 ; SI-NEXT: s_or_b32 s88, s92, s88 -; SI-NEXT: v_readlane_b32 s92, v61, 51 -; SI-NEXT: s_add_i32 s94, s92, 3 ; SI-NEXT: s_and_b32 s92, s94, 0xff ; SI-NEXT: s_lshl_b32 s91, s91, 8 ; SI-NEXT: s_add_i32 s90, s90, 3 ; SI-NEXT: s_or_b32 s91, s91, s92 ; SI-NEXT: s_and_b32 s90, s90, 0xff -; SI-NEXT: s_lshl_b32 s92, s31, 8 +; SI-NEXT: s_lshl_b32 s92, s80, 8 ; SI-NEXT: s_or_b32 s90, s92, s90 -; SI-NEXT: v_readlane_b32 s92, v61, 50 +; SI-NEXT: v_readlane_b32 s92, v61, 52 ; SI-NEXT: s_add_i32 s92, s92, 3 ; SI-NEXT: s_add_i32 s26, s93, 0x300 ; SI-NEXT: s_and_b32 s92, s92, 0xff -; SI-NEXT: s_lshl_b32 s93, s95, 8 +; SI-NEXT: s_lshl_b32 s93, s55, 8 ; SI-NEXT: s_or_b32 s92, s93, s92 -; SI-NEXT: v_readlane_b32 s93, v61, 49 +; SI-NEXT: v_readlane_b32 s93, v61, 51 ; SI-NEXT: s_add_i32 s93, s93, 3 ; SI-NEXT: s_and_b32 s93, s93, 0xff -; SI-NEXT: s_lshl_b32 s94, s55, 8 +; SI-NEXT: s_lshl_b32 s94, s95, 8 ; SI-NEXT: s_or_b32 s93, s94, s93 -; SI-NEXT: 
v_readlane_b32 s94, v61, 48 +; SI-NEXT: v_readlane_b32 s94, v61, 49 ; SI-NEXT: s_add_i32 s94, s94, 3 ; SI-NEXT: s_and_b32 s94, s94, 0xff -; SI-NEXT: s_lshl_b32 s95, s99, 8 +; SI-NEXT: s_lshl_b32 s95, s31, 8 ; SI-NEXT: s_or_b32 s94, s95, s94 -; SI-NEXT: v_readlane_b32 s95, v61, 1 +; SI-NEXT: v_readlane_b32 s95, v61, 50 ; SI-NEXT: s_add_i32 s95, s95, 3 -; SI-NEXT: v_readlane_b32 s30, v61, 0 +; SI-NEXT: v_readlane_b32 s30, v61, 48 ; SI-NEXT: s_add_i32 s6, vcc_lo, 0x300 ; SI-NEXT: s_and_b32 s95, s95, 0xff ; SI-NEXT: s_lshl_b32 vcc_lo, s30, 8 ; SI-NEXT: v_readlane_b32 s30, v61, 47 ; SI-NEXT: s_or_b32 s95, vcc_lo, s95 ; SI-NEXT: s_add_i32 vcc_lo, s30, 3 -; SI-NEXT: v_readlane_b32 s30, v61, 2 +; SI-NEXT: v_readlane_b32 s30, v61, 46 ; SI-NEXT: s_add_i32 s28, vcc_hi, 0x300 ; SI-NEXT: s_and_b32 vcc_lo, vcc_lo, 0xff ; SI-NEXT: s_lshl_b32 vcc_hi, s30, 8 -; SI-NEXT: v_readlane_b32 s30, v61, 46 +; SI-NEXT: v_readlane_b32 s30, v61, 45 ; SI-NEXT: s_or_b32 vcc_lo, vcc_hi, vcc_lo ; SI-NEXT: s_add_i32 vcc_hi, s30, 3 -; SI-NEXT: v_readlane_b32 s30, v61, 45 +; SI-NEXT: v_readlane_b32 s30, v61, 44 ; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff ; SI-NEXT: s_lshl_b32 s30, s30, 8 ; SI-NEXT: s_or_b32 vcc_hi, s30, vcc_hi -; SI-NEXT: v_readlane_b32 s30, v61, 44 +; SI-NEXT: v_readlane_b32 s30, v61, 43 ; SI-NEXT: s_add_i32 s30, s30, 3 -; SI-NEXT: v_readlane_b32 s31, v61, 43 +; SI-NEXT: v_readlane_b32 s31, v61, 42 ; SI-NEXT: s_and_b32 s30, s30, 0xff ; SI-NEXT: s_lshl_b32 s31, s31, 8 ; SI-NEXT: s_or_b32 s30, s31, s30 -; SI-NEXT: v_readlane_b32 s31, v61, 42 +; SI-NEXT: v_readlane_b32 s31, v61, 41 ; SI-NEXT: s_add_i32 s29, s34, 0x300 ; SI-NEXT: s_add_i32 s31, s31, 3 -; SI-NEXT: v_readlane_b32 s34, v61, 41 +; SI-NEXT: v_readlane_b32 s34, v61, 40 ; SI-NEXT: s_and_b32 s31, s31, 0xff ; SI-NEXT: s_lshl_b32 s34, s34, 8 ; SI-NEXT: s_or_b32 s31, s34, s31 @@ -176978,25 +177163,25 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v1, s31 ; SI-NEXT: 
s_addk_i32 s30, 0x300 ; SI-NEXT: s_addk_i32 vcc_hi, 0x300 -; SI-NEXT: v_readlane_b32 s34, v61, 40 +; SI-NEXT: v_readlane_b32 s34, v61, 39 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, s30 ; SI-NEXT: s_add_i32 s34, s34, 3 -; SI-NEXT: v_readlane_b32 s35, v61, 39 +; SI-NEXT: v_readlane_b32 s35, v61, 38 ; SI-NEXT: s_and_b32 s34, s34, 0xff ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, vcc_hi ; SI-NEXT: s_lshl_b32 s35, s35, 8 -; SI-NEXT: s_addk_i32 vcc_lo, 0x300 +; SI-NEXT: s_addk_i32 s95, 0x300 ; SI-NEXT: s_or_b32 s34, s35, s34 -; SI-NEXT: v_readlane_b32 s35, v61, 38 +; SI-NEXT: v_readlane_b32 s35, v61, 37 ; SI-NEXT: s_add_i32 s35, s35, 3 -; SI-NEXT: v_readlane_b32 s36, v61, 37 +; SI-NEXT: v_readlane_b32 s36, v61, 36 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, vcc_lo +; SI-NEXT: v_cvt_f32_f16_e32 v1, s95 ; SI-NEXT: s_and_b32 s35, s35, 0xff ; SI-NEXT: s_lshl_b32 s36, s36, 8 ; SI-NEXT: s_or_b32 s35, s36, s35 @@ -177043,19 +177228,19 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_addk_i32 s92, 0x300 ; SI-NEXT: s_addk_i32 s93, 0x300 ; SI-NEXT: s_addk_i32 s94, 0x300 -; SI-NEXT: s_addk_i32 s95, 0x300 +; SI-NEXT: s_addk_i32 vcc_lo, 0x300 ; SI-NEXT: s_addk_i32 s34, 0x300 ; SI-NEXT: s_addk_i32 s35, 0x300 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s35 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s34 +; SI-NEXT: v_cvt_f32_f16_e32 v7, vcc_lo ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v7, s95 ; SI-NEXT: v_cvt_f32_f16_e32 v10, s94 ; SI-NEXT: v_cvt_f32_f16_e32 v8, s93 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s92 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s92 ; SI-NEXT: v_cvt_f32_f16_e32 v9, s90 ; 
SI-NEXT: v_cvt_f32_f16_e32 v13, s91 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s88 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s88 ; SI-NEXT: v_cvt_f32_f16_e32 v15, s79 ; SI-NEXT: v_cvt_f32_f16_e32 v14, s77 ; SI-NEXT: v_cvt_f32_f16_e32 v17, s76 @@ -177108,7 +177293,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: .LBB93_3: ; %end +; SI-NEXT: .LBB93_5: ; %end ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -177163,26 +177348,26 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_add_i32_e32 v6, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_add_i32_e32 v6, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 12, v0 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, 
v11 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v8 ; SI-NEXT: v_add_i32_e32 v7, vcc, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 @@ -177197,7 +177382,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v11 ; SI-NEXT: v_add_i32_e32 v7, vcc, 24, v0 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 @@ -177395,134 +177580,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB93_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: s_mov_b32 s17, s19 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: s_mov_b32 s19, s50 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: s_mov_b32 s23, s48 -; SI-NEXT: s_mov_b32 s26, s37 -; SI-NEXT: s_mov_b32 s28, s29 -; SI-NEXT: s_mov_b32 s29, s76 -; SI-NEXT: s_mov_b32 s59, s58 -; SI-NEXT: s_mov_b32 s56, s47 -; SI-NEXT: s_mov_b32 s46, s41 -; SI-NEXT: s_mov_b32 s12, s11 -; SI-NEXT: s_mov_b32 s11, s7 -; SI-NEXT: s_mov_b32 s7, s97 -; SI-NEXT: s_mov_b32 s97, s81 -; SI-NEXT: s_mov_b32 s81, s85 -; SI-NEXT: s_mov_b32 s6, s40 -; SI-NEXT: s_mov_b32 s40, s72 -; SI-NEXT: s_mov_b32 s45, s73 -; SI-NEXT: s_mov_b32 s15, s89 -; SI-NEXT: s_mov_b32 s24, s98 -; SI-NEXT: s_mov_b32 s20, s88 -; SI-NEXT: s_mov_b32 s99, s55 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: v_readlane_b32 s75, v61, 7 -; SI-NEXT: v_readlane_b32 s76, v61, 3 -; SI-NEXT: v_readlane_b32 s77, v61, 5 -; SI-NEXT: v_readlane_b32 s78, v61, 4 -; SI-NEXT: v_readlane_b32 s92, v61, 6 -; SI-NEXT: v_readlane_b32 s39, v61, 9 -; SI-NEXT: v_readlane_b32 s37, v61, 8 -; SI-NEXT: 
v_readlane_b32 s30, v61, 10 -; SI-NEXT: v_readlane_b32 s48, v61, 11 -; SI-NEXT: v_readlane_b32 s52, v61, 13 -; SI-NEXT: v_readlane_b32 s35, v61, 12 -; SI-NEXT: v_readlane_b32 s50, v61, 15 -; SI-NEXT: v_readlane_b32 s64, v61, 14 -; SI-NEXT: v_readlane_b32 s54, v61, 17 -; SI-NEXT: v_readlane_b32 s67, v61, 16 -; SI-NEXT: v_readlane_b32 s65, v61, 18 -; SI-NEXT: v_readlane_b32 s70, v61, 19 -; SI-NEXT: v_readlane_b32 s49, v61, 21 -; SI-NEXT: v_readlane_b32 s71, v61, 20 -; SI-NEXT: v_readlane_b32 s80, v61, 23 -; SI-NEXT: v_readlane_b32 s83, v61, 22 -; SI-NEXT: v_readlane_b32 s84, v61, 25 -; SI-NEXT: v_readlane_b32 s82, v61, 24 -; SI-NEXT: v_readlane_b32 s87, v61, 26 -; SI-NEXT: v_readlane_b32 s86, v61, 27 -; SI-NEXT: v_readlane_b32 s96, v61, 29 -; SI-NEXT: v_readlane_b32 s51, v61, 28 -; SI-NEXT: s_mov_b32 s55, s93 -; SI-NEXT: s_mov_b32 s95, s91 -; SI-NEXT: v_readlane_b32 s94, v61, 31 -; SI-NEXT: s_mov_b32 s31, s90 -; SI-NEXT: v_readlane_b32 s34, v61, 30 -; SI-NEXT: v_readlane_b32 s53, v61, 32 -; SI-NEXT: v_readlane_b32 s66, v61, 33 -; SI-NEXT: v_readlane_b32 s68, v61, 34 -; SI-NEXT: v_readlane_b32 s69, v61, 35 -; SI-NEXT: v_readlane_b32 s8, v61, 36 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; 
implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: s_branch .LBB93_2 ; ; VI-LABEL: bitcast_v128i8_to_v64f16_scalar: ; VI: ; %bb.0: @@ -177584,13 +177641,14 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 ; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte 
Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v7 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v40, 8, v27 ; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 @@ -177602,46 +177660,42 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v8 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v10 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:496 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v12 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v26 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v24 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v28 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v30 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; VI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:200 @@ -177650,34 +177704,37 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 ; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:232 ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 -; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v22 -; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v24 +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v26 +; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v28 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 ; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v0 +; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:600 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v2 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264 @@ -177696,6 +177753,11 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v6 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:328 @@ -177704,12 +177766,8 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:28 ; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:36 -; VI-NEXT: s_waitcnt vmcnt(11) 
-; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v6 ; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v3 ; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v5 -; VI-NEXT: s_waitcnt vmcnt(10) -; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 ; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v0 ; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:44 @@ -177718,47 +177776,45 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:68 ; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:76 ; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:108 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:116 ; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:124 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:140 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:156 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:164 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:172 ; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:180 ; VI-NEXT: buffer_load_ushort v63, off, 
s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:196 ; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:204 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:212 ; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:220 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:228 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:244 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:252 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:260 ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:268 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276 +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:284 -; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:292 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:300 -; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:308 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:268 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v7, off, 
s[0:3], s32 offset:528 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:316 -; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:324 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:300 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:308 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:316 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:324 ; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill @@ -177768,46 +177824,50 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], 
s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:748 ; 4-byte 
Folded Spill -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; 
VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; 
VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB93_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload @@ -177824,11 +177884,10 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -177852,6 +177911,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v17, v10 ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, 
s[0:3], s32 offset:432 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload @@ -177868,38 +177928,43 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v1, v1, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v0, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v40, v42 +; VI-NEXT: v_mov_b32_e32 v42, v44 +; VI-NEXT: v_mov_b32_e32 v44, v45 +; VI-NEXT: v_mov_b32_e32 v45, v62 +; VI-NEXT: v_or_b32_sdwa v2, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v53, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; 
VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v34, v24 ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v36, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v37, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -177907,77 +177972,74 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v0, v38, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v39, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v48, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v45, v62 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v48, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: 
buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v32, v1 ; VI-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v54, v22 -; VI-NEXT: v_mov_b32_e32 v41, v24 ; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v34, v0 +; VI-NEXT: v_mov_b32_e32 v33, v0 ; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v37, v1 -; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v55, v26 +; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v50, v26 ; VI-NEXT: 
v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v39, v0 -; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v49, v1 -; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v43, v27 +; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v51, v0 -; VI-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v35, v1 -; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v53, v28 +; VI-NEXT: v_mov_b32_e32 v53, v1 +; VI-NEXT: v_or_b32_sdwa 
v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v52, v28 ; VI-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v47, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v33, v0 -; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v57, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v47, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v36, v0 +; VI-NEXT: v_mov_b32_e32 v55, v0 +; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v35, v0 ; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v41, v1 +; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v63, v27 +; VI-NEXT: v_mov_b32_e32 v46, v57 ; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v56, v0 +; VI-NEXT: v_mov_b32_e32 v36, v0 ; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v58, v1 -; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v61, v60 -; VI-NEXT: v_mov_b32_e32 v60, v59 +; VI-NEXT: v_mov_b32_e32 v56, v1 +; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v61, v59 ; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload @@ -177989,55 +178051,53 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v45, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v44, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v50, v0 +; VI-NEXT: v_mov_b32_e32 v58, v0 ; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v1, v62, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v52, v0 -; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v43, v0 +; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v0, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v59, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v46, v1 -; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v60, v1 +; VI-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v63, v0 -; VI-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: 
v_mov_b32_e32 v54, v0 +; VI-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v47, v1 -; VI-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v57, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v0, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 @@ -178069,12 +178129,10 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_cbranch_execnz .LBB93_3 ; VI-NEXT: .LBB93_2: ; %cmp.true -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v59 -; VI-NEXT: v_or_b32_sdwa v29, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 @@ -178093,165 +178151,147 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: s_lshl_b32 s9, s19, 8 ; VI-NEXT: s_add_i32 s16, s16, 3 ; VI-NEXT: s_lshl_b32 s10, s17, 8 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v26, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v28, v60, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v59 +; VI-NEXT: v_or_b32_sdwa v25, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v62 -; VI-NEXT: v_or_b32_sdwa v28, v43, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v44 -; VI-NEXT: v_or_b32_sdwa v53, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v27, v63, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v45 -; VI-NEXT: v_or_b32_sdwa v27, v55, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v52, v43, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v44 +; VI-NEXT: v_or_b32_sdwa v26, v50, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v42 -; VI-NEXT: v_or_b32_sdwa v52, v50, v3 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v63, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v40 -; VI-NEXT: v_or_b32_sdwa v25, v48, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v60 -; VI-NEXT: v_or_b32_sdwa v59, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v43, v48, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v61 -; VI-NEXT: v_or_b32_sdwa v24, v58, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v59, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v46 +; VI-NEXT: v_or_b32_sdwa v24, v56, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v48, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v48, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v48, vcc, 0x300, v48 ; VI-NEXT: v_or_b32_sdwa v24, v24, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v24 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: v_or_b32_sdwa v23, v41, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: 
buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v38, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v38, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v38, vcc, 0x300, v38 ; VI-NEXT: v_or_b32_sdwa v23, v23, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v23 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v22, v54, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v22, v34, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v50, v33, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v50, vcc, 0x300, v50 +; VI-NEXT: v_or_b32_sdwa v36, v55, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v50, vcc, 0x300, v36 ; VI-NEXT: v_or_b32_sdwa v22, v22, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v22 ; VI-NEXT: s_waitcnt vmcnt(0) ; 
VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v21, v35, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v21, v53, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v54, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v53, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: v_or_b32_sdwa v20, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: v_or_b32_sdwa v49, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v49, 
vcc, 0x300, v49 ; VI-NEXT: v_or_b32_sdwa v20, v20, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v20 -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: v_or_b32_sdwa v19, v37, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v37, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v37, v33, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v37, vcc, 0x300, v37 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v31, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v19, v19, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v19 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: 
v_or_b32_sdwa v18, v32, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v57, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v35, vcc, 0x300, v57 -; VI-NEXT: v_or_b32_sdwa v18, v18, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v58, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v58 +; VI-NEXT: v_or_b32_sdwa v18, v18, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v18 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v16, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], 
s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v10, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v11, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v11, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v15, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; VI-NEXT: 
s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v15, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v56, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v56, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v14, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v14, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v34, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v34 ; VI-NEXT: v_or_b32_sdwa v14, v14, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v14 @@ -178260,67 
+178300,78 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v13, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v31, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v36, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v35, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v36 -; VI-NEXT: v_or_b32_sdwa v13, v13, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v26 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x300, v52 -; VI-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v52, vcc, 0x300, v54 -; VI-NEXT: v_or_b32_sdwa v21, v21, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v0, 
vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v35, vcc, 0x300, v35 +; VI-NEXT: v_or_b32_sdwa v13, v13, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v35, vcc, 0x300, v25 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x300, v59 +; VI-NEXT: v_or_b32_sdwa v25, v43, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v28, v28, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v13 -; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v21 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v26 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v25 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v28 +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v12, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v30, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: 
v_or_b32_sdwa v51, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v51 ; VI-NEXT: v_or_b32_sdwa v12, v12, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v59 -; VI-NEXT: v_or_b32_sdwa v25, v25, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x300, v1 +; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v63 +; VI-NEXT: v_or_b32_sdwa v26, v26, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v30, v30, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12 -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v25 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v26 +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v30 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v33, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v57, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt 
vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v40, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_or_b32_sdwa v30, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v39, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v2 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -178344,15 +178395,14 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: v_add_u32_e32 v41, vcc, 0x300, v10 ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v55 ; VI-NEXT: v_or_b32_sdwa v10, v39, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v53 -; VI-NEXT: v_or_b32_sdwa v27, v28, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; 
VI-NEXT: v_or_b32_sdwa v28, v29, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v29, v30, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v52 +; VI-NEXT: v_add_u32_e32 v52, vcc, 0x300, v53 +; VI-NEXT: v_or_b32_sdwa v21, v21, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v27, v27, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9 ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v21 ; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v27 -; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v28 -; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v29 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -178368,18 +178418,14 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v42 ; VI-NEXT: v_or_b32_sdwa v8, v8, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v11 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x300, v40 -; VI-NEXT: v_or_b32_sdwa v11, v33, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x300, v1 -; VI-NEXT: v_or_b32_sdwa v30, v31, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v17, v17, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x300, v40 +; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 ; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 -; VI-NEXT: 
v_add_u32_e32 v30, vcc, 0x3000000, v30 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v7, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload @@ -178419,19 +178465,29 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v46, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_or_b32_sdwa v29, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v46, vcc, 0x300, v46 ; VI-NEXT: v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v2 +; VI-NEXT: v_or_b32_sdwa v29, v29, v34 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v29 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 ; VI-NEXT: v_or_b32_sdwa v4, v47, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v47, vcc, 3, v32 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x300, v4 ; VI-NEXT: v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 @@ -178498,35 +178554,38 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB93_4: -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:572 ; 4-byte 
Folded Reload -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v61, v60 -; VI-NEXT: v_mov_b32_e32 v60, v59 +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v61, v59 +; VI-NEXT: v_mov_b32_e32 v46, v57 +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v40, v42 +; VI-NEXT: v_mov_b32_e32 v42, v44 +; VI-NEXT: v_mov_b32_e32 v44, v45 ; VI-NEXT: v_mov_b32_e32 v45, v62 -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte 
Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v57, v5 +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v47, v4 -; VI-NEXT: v_mov_b32_e32 v63, v3 -; VI-NEXT: v_mov_b32_e32 v53, v28 -; VI-NEXT: v_mov_b32_e32 v43, v27 -; VI-NEXT: v_mov_b32_e32 v55, v26 -; VI-NEXT: v_mov_b32_e32 v41, v24 -; VI-NEXT: v_mov_b32_e32 v54, v22 +; VI-NEXT: v_mov_b32_e32 v54, v3 +; VI-NEXT: v_mov_b32_e32 v52, v28 +; VI-NEXT: v_mov_b32_e32 v63, v27 +; VI-NEXT: v_mov_b32_e32 v50, v26 +; VI-NEXT: v_mov_b32_e32 v34, v24 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_branch .LBB93_2 @@ -178588,18 +178647,18 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v29 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v5 ; GFX9-NEXT: v_lshlrev_b32_e32 v22, 8, v7 ; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v9 ; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v11 ; GFX9-NEXT: v_lshlrev_b32_e32 v20, 8, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v19 ; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v21 ; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; 
GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v27 ; GFX9-NEXT: s_waitcnt vmcnt(24) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v43 ; GFX9-NEXT: s_waitcnt vmcnt(23) @@ -178628,10 +178687,10 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v52 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v51 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v50 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill @@ -178643,7 +178702,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v39 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v30 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill @@ -178691,7 +178750,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, 
off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill @@ -178718,23 +178777,23 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v15 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 @@ 
-178747,48 +178806,49 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v3 ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:44 ; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:52 ; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68 ; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:76 ; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:92 ; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:100 ; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:116 ; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:124 ; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:132 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:140 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:140 ; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:148 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:156 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:164 -; GFX9-NEXT: s_waitcnt vmcnt(21) -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:172 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:180 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:196 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:204 ; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:212 -; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:220 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:228 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:220 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:228 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:236 ; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:244 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 -; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:252 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:260 ; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:268 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:276 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:276 ; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:284 ; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:292 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:300 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:316 +; GFX9-NEXT: 
buffer_load_ushort v58, off, s[0:3], s32 offset:300 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:308 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:316 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:324 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill @@ -178799,55 +178859,54 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(30) -; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:676 ; 
4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(33) ; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(36) -; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(39) +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; 
GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(41) -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(41) -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(40) +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(40) +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(41) +; GFX9-NEXT: s_waitcnt vmcnt(40) ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(41) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(41) +; GFX9-NEXT: s_waitcnt vmcnt(40) +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(40) ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:768 ; 4-byte Folded 
Spill ; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill @@ -178857,7 +178916,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB93_2 @@ -178870,7 +178929,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_or_b32_sdwa v2, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v18 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v6, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -178907,10 +178966,10 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload @@ -178926,13 +178985,13 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; 
GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -178940,7 +178999,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload @@ -178981,8 +179040,8 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: v_mov_b32_e32 v52, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_mov_b32_e32 v50, v0 ; GFX9-NEXT: 
v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -179000,16 +179059,16 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_mov_b32_e32 v48, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v17, v17, 16, v1 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_mov_b32_e32 v33, v45 +; GFX9-NEXT: v_mov_b32_e32 v33, v46 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v18, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload @@ -179022,7 +179081,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: 
v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -179031,7 +179090,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -179039,121 +179098,122 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v22, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v34, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v23, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_mov_b32_e32 v46, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v1, v35, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v35, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v24, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v35, v45 -; GFX9-NEXT: v_mov_b32_e32 v45, v61 -; GFX9-NEXT: v_mov_b32_e32 v61, v42 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v1, v51, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_mov_b32_e32 v38, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v37, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshl_or_b32 v25, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v54, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v54, v2 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v41, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v26, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v27, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v41, v57 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v29, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; 
GFX9-NEXT: v_or_b32_sdwa v1, v54, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v26, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v45, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshl_or_b32 v27, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v60, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v60, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v1, v57, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v29, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v28, v1, 16, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v59, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v63, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v59 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v57, v59 ; GFX9-NEXT: v_lshl_or_b32 v29, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v30, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v56, v42 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v31, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_branch .LBB93_3 ; GFX9-NEXT: .LBB93_2: ; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:616 
; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v33, v45 -; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v33, v46 +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 
offset:552 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v56, v61 +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 s[4:5], -1 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: .LBB93_3: ; %Flow ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] @@ -179356,7 +179416,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; 
GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -179416,11 +179476,11 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v48, v46, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v48, v40, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v49, v35, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v49, v46, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 @@ -179455,7 +179515,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: v_or_b32_sdwa v53, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 -; GFX9-NEXT: v_add_u32_e32 v26, 3, v61 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v62 ; GFX9-NEXT: v_or_b32_sdwa v24, v54, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v36, 0x300, v24 ; GFX9-NEXT: v_add_u32_e32 
v24, 0x300, v48 @@ -179464,7 +179524,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v54, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v26, 3, v45 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v61 ; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 ; GFX9-NEXT: v_or_b32_sdwa v20, v57, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v34, 0x300, v20 @@ -179473,7 +179533,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v26, 3, v56 ; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 -; GFX9-NEXT: v_or_b32_sdwa v21, v32, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v21, v45, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v28, 0x300, v21 ; GFX9-NEXT: v_add_u32_e32 v21, 0x300, v54 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 @@ -181298,212 +181358,207 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v44, off, 
s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:56 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:68 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:64 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:92 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v4 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v31, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v33, v8 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v46, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v8 -; 
SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v31, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v20 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v28 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v14 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 
-; SI-NEXT: v_cvt_f16_f32_e32 v13, v24 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v13, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v15 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v18 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v17 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v37 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v39 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v51 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded 
Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v55 ; SI-NEXT: v_cvt_f16_f32_e32 v47, v54 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v10, v44 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v44 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v3, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v61 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v61, v42 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v45 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: 
; implicit-def: $vgpr21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; kill: killed $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, 
off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v46 -; SI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v40 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v40, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v3 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v55, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v5 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v43, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v15 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 +; SI-NEXT: 
v_cvt_f16_f32_e32 v63, v8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v40, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v62, v15 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v42, v16 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v16 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v6 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v46, v14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v14 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; kill: killed $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr14 @@ -181627,20 +181682,30 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; kill: killed $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v15 ; SI-NEXT: ; kill: killed $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; kill: killed $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; kill: killed $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr15 +; SI-NEXT: ; 
implicit-def: $vgpr15 ; SI-NEXT: ; kill: killed $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 +; SI-NEXT: ; kill: killed $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; kill: killed $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; kill: killed $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; kill: killed $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr14 @@ -181650,407 +181715,400 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; kill: killed $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: v_mov_b32_e32 v45, v46 -; SI-NEXT: v_mov_b32_e32 v46, v6 -; SI-NEXT: v_mov_b32_e32 v6, v5 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; kill: killed $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; kill: killed $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB94_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v16, v46 +; SI-NEXT: v_mov_b32_e32 v42, v40 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v20, v1, v2 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; 
SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v44, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v46 +; SI-NEXT: v_mov_b32_e32 v46, v33 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v41, v15, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v44, v5, v14 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v54, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v33 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v32, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v41, v5, v14 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v54, v5, v14 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v32 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v53, v5, v14 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v53, v15, v14 +; SI-NEXT: buffer_load_dword v14, 
off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v51, v5, v14 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v51, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 +; SI-NEXT: v_mov_b32_e32 v31, v13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v52, v5, v14 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v49, v5, v14 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v50, v5, v14 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v48, v5, v14 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v9 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v39, v5, v14 -; SI-NEXT: v_alignbit_b32 v5, v41, v44, 24 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 -; SI-NEXT: buffer_load_dword 
v5, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v52, v15, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v37, v5, v14 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v49, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v13 +; SI-NEXT: v_mov_b32_e32 v13, v12 +; SI-NEXT: v_mov_b32_e32 v12, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v50, v15, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v38, v5, v14 -; SI-NEXT: v_alignbit_b32 v5, v41, v44, 16 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v35, v5, v14 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v48, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v11 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v36, v5, v14 -; 
SI-NEXT: v_alignbit_b32 v5, v41, v44, 8 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v39, v15, v14 +; SI-NEXT: v_alignbit_b32 v14, v41, v44, 24 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v30, v5, v14 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v10 +; SI-NEXT: v_or_b32_e32 v37, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v33 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v34, v5, v14 -; SI-NEXT: v_alignbit_b32 v5, v53, v54, 24 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v38, v15, v14 +; SI-NEXT: v_alignbit_b32 v14, v41, v44, 16 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; 
SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v28, v5, v14 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v8 +; SI-NEXT: v_or_b32_e32 v35, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v29, v5, v14 -; SI-NEXT: v_alignbit_b32 v5, v53, v54, 16 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v36, v15, v14 +; SI-NEXT: v_alignbit_b32 v14, v41, v44, 8 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v30, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v26, v5, v14 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_or_b32_e32 v34, v15, v14 +; SI-NEXT: v_alignbit_b32 v14, v53, v54, 24 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) 
-; SI-NEXT: v_or_b32_e32 v27, v5, v14 -; SI-NEXT: v_alignbit_b32 v5, v53, v54, 8 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v28, v14, v9 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v55 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v29, v14, v9 +; SI-NEXT: v_alignbit_b32 v9, v53, v54, 16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v56 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v24, v47, v14 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v45 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v26, v43, v9 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v25, v5, v14 -; SI-NEXT: v_alignbit_b32 v5, v52, v51, 24 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v45 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v27, v14, v9 +; SI-NEXT: v_alignbit_b32 v9, v53, v54, 8 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v59 -; SI-NEXT: v_or_b32_e32 v22, v58, v5 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v56 -; SI-NEXT: v_or_b32_e32 v23, v57, v5 -; SI-NEXT: v_alignbit_b32 v5, v52, v51, 16 -; SI-NEXT: 
buffer_store_dword v5, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v56 +; SI-NEXT: v_or_b32_e32 v24, v47, v9 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v43 +; SI-NEXT: v_or_b32_e32 v25, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v9, v52, v51, 24 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v62 -; SI-NEXT: v_or_b32_e32 v20, v61, v5 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 -; SI-NEXT: v_or_b32_e32 v21, v60, v5 -; SI-NEXT: v_alignbit_b32 v5, v52, v51, 8 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v63 -; SI-NEXT: v_or_b32_e32 v18, v40, v5 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v59 +; SI-NEXT: v_or_b32_e32 v22, v58, v9 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v23, v57, v9 +; SI-NEXT: v_alignbit_b32 v9, v52, v51, 16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v40 -; SI-NEXT: v_or_b32_e32 v19, v55, v5 -; SI-NEXT: v_alignbit_b32 v5, v50, v49, 24 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 
offset:408 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 +; SI-NEXT: v_or_b32_e32 v21, v60, v1 +; SI-NEXT: v_alignbit_b32 v1, v52, v51, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v43 -; SI-NEXT: v_or_b32_e32 v16, v1, v5 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 -; SI-NEXT: v_or_b32_e32 v17, v42, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v61 +; SI-NEXT: v_or_b32_e32 v18, v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v19, v3, v1 +; SI-NEXT: v_alignbit_b32 v1, v50, v49, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_or_b32_e32 v14, v63, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: v_or_b32_e32 v17, v62, v1 ; SI-NEXT: v_alignbit_b32 v1, v50, v49, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_or_b32_e32 v14, v4, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 -; SI-NEXT: v_or_b32_e32 v15, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; SI-NEXT: v_or_b32_e32 v3, v8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: v_or_b32_e32 v15, v6, v1 ; SI-NEXT: v_alignbit_b32 v1, v50, v49, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v39, v48, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v39, v48, 16 -; 
SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v39, v48, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v38, v37, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v38, v37, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v38, v37, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v36, v35, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v36, v35, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v36, v35, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v34, v30, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v34, v30, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v34, v30, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v29, v28, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v29, v28, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v29, v28, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v27, v26, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v27, v26, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v27, v26, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v25, v24, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v25, v24, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v25, v24, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v23, v22, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v23, v22, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v23, v22, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v21, v20, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v21, v20, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v21, v20, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v19, v18, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v19, v18, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v19, v18, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v17, v14, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v17, v14, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v17, v16, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v17, v14, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v1, v15, v3, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v17, v16, 8 +; SI-NEXT: v_alignbit_b32 v1, v15, v3, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, 
v15, v14, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v15, v14, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v15, v14, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v15, v3, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v41 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v53 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v52 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v50 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v39 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v38 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill ; 
SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v36 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v34 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v27 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v25 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v23 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v17 -; 
SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v15 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v33, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v16, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v46, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v32, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v31, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v11, 8, 8 +; SI-NEXT: v_mov_b32_e32 v11, v12 +; SI-NEXT: v_mov_b32_e32 v12, v13 +; SI-NEXT: v_mov_b32_e32 v13, v31 +; SI-NEXT: v_mov_b32_e32 v31, v32 +; SI-NEXT: v_mov_b32_e32 v32, v33 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v12, 8, 8 +; SI-NEXT: v_bfe_u32 v1, v32, 8, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v9, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v13, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, 
s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v12, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v11, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v10, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v55, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v8, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v45, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v7, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v43, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v6, 8, 8 +; SI-NEXT: v_bfe_u32 v1, v10, 8, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v56, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v9, 8, 8 +; SI-NEXT: v_mov_b32_e32 v55, v40 +; SI-NEXT: v_mov_b32_e32 v40, v42 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v47, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v2, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 
4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v40, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v46, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v45, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 @@ -182104,202 +182162,209 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: v_mov_b32_e32 v33, v46 +; SI-NEXT: v_mov_b32_e32 v46, v16 +; SI-NEXT: v_mov_b32_e32 v16, v3 +; SI-NEXT: v_bfe_u32 v42, v55, 8, 8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; 
implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: .LBB94_2: ; %Flow -; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; SI-NEXT: s_xor_b64 exec, exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB94_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v16, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v55 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v55 +; SI-NEXT: v_or_b32_e32 v15, v6, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 
v14, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_or_b32_e32 v14, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v62 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v47 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 -; SI-NEXT: v_or_b32_e32 v15, v2, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v16, v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v42 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 -; SI-NEXT: v_or_b32_e32 v17, v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v40 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v18, v2, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v20, v1, v2 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v40 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_or_b32_e32 v17, v5, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v60 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte 
Folded Reload +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v18, v4, v5 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; SI-NEXT: v_or_b32_e32 v19, v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v20, v2, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v2, v60 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 ; SI-NEXT: v_or_b32_e32 v21, v1, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v58 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_or_b32_e32 v19, v3, v4 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: 
v_cvt_f32_f16_e32 v4, v47 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v22, v2, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v42 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; SI-NEXT: v_or_b32_e32 v23, v1, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v4, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_or_b32_e32 v24, v3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v25, v2, v3 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v3, v7 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:640 ; 4-byte Folded 
Reload +; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; SI-NEXT: v_or_b32_e32 v25, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v26, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v28, v4, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 
v3, v3 -; SI-NEXT: v_or_b32_e32 v27, v3, v1 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v27, v3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v29, v1, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v11 ; SI-NEXT: v_or_b32_e32 v30, v3, v1 -; SI-NEXT: buffer_load_dword v1, off, 
s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v34, v2, v3 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v3, v11 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -182312,24 +182377,24 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v35, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v11 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: 
v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v37, v4, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_or_b32_e32 v36, v3, v1 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v1 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -182339,34 +182404,36 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 ; SI-NEXT: v_or_b32_e32 v38, v1, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_or_b32_e32 v48, v3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 
offset:512 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v39, v2, v3 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v39, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v13 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 @@ -182377,21 +182444,21 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v49, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; 
SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v13 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v51, v4, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_or_b32_e32 v50, v3, v1 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v1 @@ -182407,30 +182474,30 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v31 ; SI-NEXT: v_or_b32_e32 v52, v1, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 ; SI-NEXT: v_or_b32_e32 v54, v3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; 
SI-NEXT: v_cvt_f16_f32_e32 v32, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v53, v2, v3 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 @@ -182441,274 +182508,279 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v44, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 ; SI-NEXT: v_or_b32_e32 v41, v3, v1 ; SI-NEXT: v_alignbit_b32 v1, v41, v44, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v41, v44, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v41, v44, 8 +; 
SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v53, v54, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v53, v54, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v53, v54, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v52, v51, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v52, v51, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v52, v51, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v50, v49, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v50, v49, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:460 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v50, v49, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v39, v48, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v39, v48, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v39, v48, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v38, v37, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v38, v37, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v38, v37, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v36, v35, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: 
s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v36, v35, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v36, v35, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v34, v30, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v34, v30, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v34, v30, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v29, v28, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v29, v28, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v29, v28, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, 
v27, v26, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v27, v26, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v27, v26, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v25, v24, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v25, v24, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v25, v24, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v23, v22, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v23, v22, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v23, v22, 8 -; SI-NEXT: buffer_store_dword v1, 
off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v21, v20, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v21, v20, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v21, v20, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v19, v18, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v19, v18, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v19, v18, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v17, v14, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v17, v14, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v17, v16, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; 
SI-NEXT: v_alignbit_b32 v1, v17, v14, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v1, v15, v16, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v17, v16, 8 +; SI-NEXT: v_alignbit_b32 v1, v15, v16, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v15, v14, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v15, v14, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v15, v14, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v15, v16, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v41 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v53 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v52 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v50 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 
4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v39 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v38 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v36 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v34 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v27 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v25 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v23 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:400 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v15 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v33, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v46, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v32, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v33, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v31, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v13, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_bfe_u32 v1, v10, 8, 
8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v12, 8, 8 +; SI-NEXT: v_bfe_u32 v1, v32, 8, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v9, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v13, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v12, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v11, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v10, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_bfe_u32 v1, v9, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_bfe_u32 v1, v8, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_bfe_u32 v1, v7, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_bfe_u32 v1, v6, 8, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_bfe_u32 v1, v42, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_bfe_u32 v1, v55, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_bfe_u32 v1, v40, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v46, 8, 8 -; SI-NEXT: v_alignbit_b32 v5, v41, v44, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v5, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v45, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v40, 8, 8 +; SI-NEXT: v_bfe_u32 v42, v55, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: .LBB94_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v5 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v41 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v46 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -182717,14 +182789,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 
0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -182735,14 +182807,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v53 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v32 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -182751,14 +182823,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, 
s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v51 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -182769,8 +182841,8 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v52 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -182785,14 +182857,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -182803,14 +182875,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v50 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v13 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -182819,14 +182891,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v48 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 
offset:468 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -182837,30 +182909,32 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: 
v_and_b32_e32 v1, 0xff, v37 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -182871,14 +182945,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v13 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v32 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -182887,14 +182961,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 -; SI-NEXT: 
buffer_load_dword v3, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -182905,14 +182979,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v36 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v11 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v12 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -182921,14 +182995,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v30 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 
offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -182939,14 +183013,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v34 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -182955,14 +183029,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -182973,30 +183047,32 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; 
SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v26 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -183007,33 +183083,35 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v27 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; 
SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -183041,30 +183119,32 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v25 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; 
SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v22 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -183075,14 +183155,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v23 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 
offset:208 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -183093,14 +183173,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -183111,14 +183191,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -183129,14 +183209,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -183147,14 +183227,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v19 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, 
s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -183165,14 +183245,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -183183,14 +183263,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v17 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: 
v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v46 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v40 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -183199,14 +183279,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -183217,18 +183297,16 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v42 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: s_waitcnt vmcnt(1) +; 
SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v45 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v55 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 @@ -183272,54 +183350,56 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v16 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 ; VI-NEXT: 
buffer_store_dword v32, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 ; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v29 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v28 ; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v26 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 
offset:156 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v17 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr52 -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v62 +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v61 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; kill: killed $vgpr31 ; VI-NEXT: ; implicit-def: $vgpr31 @@ -183372,6 +183452,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: ; kill: killed $vgpr31 ; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; kill: killed $vgpr31 ; VI-NEXT: ; implicit-def: $vgpr31 @@ -183415,31 +183496,30 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; kill: killed $vgpr31 ; VI-NEXT: ; implicit-def: $vgpr31 -; 
VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v61 -; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v60 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr52 ; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill @@ -183454,770 +183534,773 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: s_cbranch_execz .LBB94_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v16 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v16 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v15 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v14 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v14 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; VI-NEXT: 
v_lshrrev_b32_e32 v31, 8, v13 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v12 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v12 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v11 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v10 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v10 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v9 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v8 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v8 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v7 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 
offset:248 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v6 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v6 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v4 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v4 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v3 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v2 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v2 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v1 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v61 -; VI-NEXT: 
buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v62 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v62 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v61 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v60 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v30 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v30 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v29 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v28 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v28 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v27 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, 
s[0:3], s32 offset:272 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v26 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v26 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v25 ; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v25 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v24 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v24 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v23 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v22 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v22 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v21 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v20 -; VI-NEXT: buffer_store_dword v31, 
off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v20 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v19 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v18 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v18 ; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v18 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v17 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[15:16] ; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[13:14] -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[11:12] -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: 
buffer_store_dword v32, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[9:10] -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[7:8] -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v31, v33 -; VI-NEXT: v_mov_b32_e32 v33, v43 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v31, v43 ; VI-NEXT: v_lshrrev_b64 v[42:43], 24, v[5:6] -; VI-NEXT: v_mov_b32_e32 v43, v33 -; VI-NEXT: v_mov_b32_e32 v33, v46 -; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[3:4] -; VI-NEXT: v_mov_b32_e32 v46, v33 -; VI-NEXT: v_mov_b32_e32 v33, v53 -; VI-NEXT: v_lshrrev_b64 v[52:53], 24, v[1:2] -; VI-NEXT: v_mov_b32_e32 v53, v33 -; VI-NEXT: v_lshrrev_b64 v[32:33], 24, v[60:61] +; VI-NEXT: v_mov_b32_e32 v32, v57 +; VI-NEXT: v_lshrrev_b64 v[56:57], 24, v[3:4] +; VI-NEXT: v_mov_b32_e32 v43, v31 +; VI-NEXT: v_mov_b32_e32 v57, v32 +; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[1:2] +; VI-NEXT: v_lshrrev_b64 v[32:33], 24, v[61:62] ; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[32:33], 24, v[29:30] -; VI-NEXT: v_lshrrev_b64 v[34:35], 24, v[27:28] ; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded 
Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, v36 +; VI-NEXT: v_lshrrev_b64 v[59:60], 24, v[27:28] +; VI-NEXT: v_mov_b32_e32 v32, v35 ; VI-NEXT: v_lshrrev_b64 v[35:36], 24, v[25:26] -; VI-NEXT: v_lshrrev_b64 v[49:50], 24, v[23:24] -; VI-NEXT: v_mov_b32_e32 v36, v33 -; VI-NEXT: v_mov_b32_e32 v33, v41 -; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22] -; VI-NEXT: v_mov_b32_e32 v34, v51 +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, v32 +; VI-NEXT: v_mov_b32_e32 v32, v41 +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24] +; VI-NEXT: v_mov_b32_e32 v41, v32 +; VI-NEXT: v_mov_b32_e32 v32, v53 +; VI-NEXT: v_lshrrev_b64 v[52:53], 24, v[21:22] +; VI-NEXT: v_lshrrev_b32_e32 v50, 24, v18 +; VI-NEXT: v_mov_b32_e32 v53, v32 +; VI-NEXT: v_mov_b32_e32 v32, v34 +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20] +; VI-NEXT: v_mov_b32_e32 v59, v50 +; VI-NEXT: v_mov_b32_e32 v34, v32 +; VI-NEXT: v_mov_b32_e32 v32, v51 ; VI-NEXT: v_lshrrev_b64 v[50:51], 24, v[17:18] -; VI-NEXT: v_mov_b32_e32 v41, v33 -; VI-NEXT: v_mov_b32_e32 v33, v31 -; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[19:20] -; VI-NEXT: v_mov_b32_e32 v51, v34 +; VI-NEXT: v_mov_b32_e32 v51, v32 ; VI-NEXT: .LBB94_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB94_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v63, 0x200 -; VI-NEXT: v_add_f16_sdwa v31, v18, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:392 ; 4-byte 
Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, 0x200 +; VI-NEXT: v_add_f16_sdwa v31, v18, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; VI-NEXT: v_add_f16_e32 v18, 0x200, v18 ; VI-NEXT: v_or_b32_e32 v32, v18, v31 -; VI-NEXT: v_add_f16_sdwa v31, v17, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: v_add_f16_sdwa v31, v17, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; VI-NEXT: v_add_f16_e32 v17, 0x200, v17 ; VI-NEXT: v_or_b32_e32 v31, v17, v31 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; VI-NEXT: v_add_f16_sdwa v31, v20, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: v_add_f16_sdwa v31, v20, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; VI-NEXT: v_add_f16_e32 v20, 0x200, v20 ; VI-NEXT: v_or_b32_e32 v32, v20, v31 -; VI-NEXT: v_add_f16_sdwa v31, v19, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: v_add_f16_sdwa v31, v19, v55 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; VI-NEXT: v_add_f16_e32 v19, 0x200, v19 ; VI-NEXT: v_or_b32_e32 v31, v19, v31 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_add_f16_sdwa v34, v22, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v34 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_add_f16_sdwa v31, v22, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; VI-NEXT: v_add_f16_e32 v22, 0x200, v22 ; VI-NEXT: v_or_b32_e32 v32, v22, v31 -; VI-NEXT: v_add_f16_sdwa v31, v21, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: v_add_f16_sdwa v31, v21, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; VI-NEXT: v_add_f16_e32 v21, 0x200, v21 ; VI-NEXT: v_or_b32_e32 v31, v21, v31 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; VI-NEXT: v_add_f16_sdwa v31, v24, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte 
Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_add_f16_sdwa v31, v24, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 ; VI-NEXT: v_or_b32_e32 v32, v24, v31 -; VI-NEXT: v_add_f16_sdwa v31, v23, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_add_f16_sdwa v31, v23, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; VI-NEXT: v_add_f16_e32 v23, 0x200, v23 ; VI-NEXT: v_or_b32_e32 v31, v23, v31 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_add_f16_sdwa v31, v26, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: v_add_f16_sdwa v31, v26, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; VI-NEXT: v_add_f16_e32 v26, 0x200, v26 -; VI-NEXT: v_or_b32_e32 v36, v26, v31 -; VI-NEXT: v_add_f16_sdwa v31, v25, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 
v32, v26, v31 +; VI-NEXT: v_add_f16_sdwa v31, v25, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 -; VI-NEXT: v_or_b32_e32 v35, v25, v31 -; VI-NEXT: v_add_f16_sdwa v31, v28, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v31, v25, v31 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_add_f16_sdwa v31, v28, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; VI-NEXT: v_add_f16_e32 v28, 0x200, v28 ; VI-NEXT: v_or_b32_e32 v38, v28, v31 -; VI-NEXT: v_add_f16_sdwa v31, v27, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: v_add_f16_sdwa v31, v27, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; VI-NEXT: v_add_f16_e32 v27, 0x200, v27 ; VI-NEXT: v_or_b32_e32 v37, v27, v31 -; VI-NEXT: v_add_f16_sdwa v31, v30, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v31, v30, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; VI-NEXT: v_add_f16_e32 v30, 0x200, v30 -; VI-NEXT: v_add_f16_sdwa v32, v29, v63 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v32, v29, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v29, 0x200, v29 -; VI-NEXT: v_or_b32_e32 v49, v30, v31 +; VI-NEXT: v_or_b32_e32 v34, v30, v31 ; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; VI-NEXT: v_or_b32_e32 v48, v29, v31 -; VI-NEXT: v_add_f16_sdwa v31, v61, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v33, v29, v31 +; VI-NEXT: v_add_f16_sdwa v31, v62, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; VI-NEXT: v_add_f16_e32 v61, 0x200, v61 -; VI-NEXT: v_add_f16_sdwa v32, v60, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v62, 0x200, v62 +; VI-NEXT: v_add_f16_sdwa v32, v61, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_add_f16_e32 v60, 0x200, v60 -; VI-NEXT: v_or_b32_e32 v51, v61, v31 +; VI-NEXT: v_add_f16_e32 v61, 0x200, v61 +; VI-NEXT: v_or_b32_e32 v51, v62, v31 ; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; VI-NEXT: v_or_b32_e32 v50, v60, v31 -; VI-NEXT: v_add_f16_sdwa v31, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: v_add_f16_sdwa v47, v2, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v50, v61, v31 ; VI-NEXT: v_add_f16_e32 v2, 
0x200, v2 -; VI-NEXT: v_add_f16_sdwa v32, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_add_f16_sdwa v32, v1, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v47 ; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 ; VI-NEXT: v_or_b32_e32 v53, v2, v31 ; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 ; VI-NEXT: v_or_b32_e32 v52, v1, v31 -; VI-NEXT: v_add_f16_sdwa v31, v4, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: v_add_f16_sdwa v31, v4, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 -; VI-NEXT: v_add_f16_sdwa v32, v3, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v32, v3, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 ; VI-NEXT: v_or_b32_e32 v46, v4, v31 ; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 +; VI-NEXT: v_add_f16_sdwa v36, v6, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v45, v3, v31 -; VI-NEXT: v_add_f16_sdwa v31, v6, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 -; VI-NEXT: 
v_add_f16_sdwa v32, v5, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_add_f16_sdwa v32, v5, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v36 ; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 ; VI-NEXT: v_or_b32_e32 v43, v6, v31 ; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; VI-NEXT: v_add_f16_sdwa v44, v8, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: v_add_f16_sdwa v44, v8, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v42, v5, v31 ; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 -; VI-NEXT: v_add_f16_sdwa v32, v7, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v32, v7, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v44 ; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 ; VI-NEXT: v_or_b32_e32 v41, v8, v31 ; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 +; VI-NEXT: v_add_f16_sdwa v49, v10, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v40, v7, v31 -; VI-NEXT: v_add_f16_sdwa v31, v10, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; VI-NEXT: v_add_f16_e32 v10, 0x200, v10 -; VI-NEXT: v_add_f16_sdwa v32, v9, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 -; VI-NEXT: v_or_b32_e32 v55, v10, v31 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; VI-NEXT: 
v_add_f16_sdwa v39, v12, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v54, v9, v31 +; VI-NEXT: v_add_f16_sdwa v35, v9, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v49 +; VI-NEXT: v_add_f16_sdwa v39, v12, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v32, v10, v31 +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v35 ; VI-NEXT: v_add_f16_e32 v12, 0x200, v12 -; VI-NEXT: v_add_f16_sdwa v33, v11, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v39 -; VI-NEXT: v_add_f16_sdwa v47, v14, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v32, v12, v31 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v33 +; VI-NEXT: v_add_f16_sdwa v35, v11, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v39 +; VI-NEXT: v_add_f16_e32 v11, 0x200, v11 +; VI-NEXT: v_or_b32_e32 v57, v12, v54 +; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v35 +; VI-NEXT: v_add_f16_sdwa v63, v14, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v56, v11, v54 ; VI-NEXT: v_add_f16_e32 v14, 0x200, v14 -; VI-NEXT: v_add_f16_sdwa v33, v13, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v56, 16, v47 -; VI-NEXT: v_or_b32_e32 v57, v14, v56 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v56, 16, v33 -; VI-NEXT: 
v_add_f16_sdwa v33, v16, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v48, v13, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v63 +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: v_add_f16_e32 v13, 0x200, v13 +; VI-NEXT: v_or_b32_e32 v59, v14, v54 +; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v48 +; VI-NEXT: v_add_f16_sdwa v48, v16, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v58, v13, v54 ; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 -; VI-NEXT: v_add_f16_sdwa v63, v15, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v58, 16, v33 +; VI-NEXT: v_add_f16_sdwa v60, v15, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v48 ; VI-NEXT: v_add_f16_e32 v15, 0x200, v15 -; VI-NEXT: v_or_b32_e32 v59, v16, v58 -; VI-NEXT: v_lshlrev_b32_e32 v58, 16, v63 -; VI-NEXT: v_or_b32_e32 v58, v15, v58 -; VI-NEXT: v_lshrrev_b32_e32 v62, 8, v59 -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v62, 8, v58 +; VI-NEXT: v_or_b32_e32 v55, v16, v54 +; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v60 +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v54, v15, v54 +; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v55 +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v54 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[54:55] +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v55, 8, 
v59 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v58 ; VI-NEXT: v_lshrrev_b64 v[58:59], 24, v[58:59] -; VI-NEXT: v_add_f16_e32 v13, 0x200, v13 -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v56, v13, v56 -; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v57 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v56 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v57 +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v56 ; VI-NEXT: v_lshrrev_b64 v[56:57], 24, v[56:57] -; VI-NEXT: v_add_f16_e32 v11, 0x200, v11 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v31, v11, v31 +; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 
v31, v9, v31 ; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v32 -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v31 ; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[31:32] -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v55 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v54 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[54:55] -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v41 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v40 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; 
VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[40:41] -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v43 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v42 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v46 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v45 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v53 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v52 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v51 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v50 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[50:51] +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 
v[31:32], 24, v[52:53] +; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v51 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v50 +; VI-NEXT: v_lshrrev_b64 v[50:51], 24, v[50:51] ; VI-NEXT: v_lshrrev_b64 v[42:43], 24, v[42:43] -; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[45:46] -; VI-NEXT: v_lshrrev_b64 v[52:53], 24, v[52:53] -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v49 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v48 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[48:49] -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v38 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v37 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[37:38] -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v36 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v35 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 
offset:292 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[35:36], 24, v[35:36] -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v54, v39 -; VI-NEXT: v_mov_b32_e32 v37, v44 -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_mov_b32_e32 v56, v58 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v34 +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b64 v[56:57], 24, v[45:46] +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v33 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[32:33], 24, v[33:34] +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v38 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v37 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[32:33], 24, v[37:38] +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; 
VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v57, v35 +; VI-NEXT: v_mov_b32_e32 v45, v44 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v34 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v33 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[32:33], 24, v[33:34] +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v49 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v48 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[49:50], 24, v[48:49] -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v48, v33 +; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v34 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v33 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[33:34] +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 
offset:252 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v41, v39 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v51 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v50 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[50:51] -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v34 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v33 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[52:53], 24, v[33:34] +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v51 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v50 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[50:51] -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v34 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 
offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v33 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[33:34] +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v51 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v50 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v32, v33, 8, 8 -; VI-NEXT: v_mov_b32_e32 v33, v47 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v32, v33, 8, 8 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v32, v39, 8, 8 -; VI-NEXT: v_mov_b32_e32 v39, v63 -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v32, v48, 8, 8 +; VI-NEXT: v_mov_b32_e32 v48, v63 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v32, v48, 8, 8 ; VI-NEXT: v_lshrrev_b64 v[50:51], 24, v[50:51] -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: 
v_bfe_u32 v32, v63, 8, 8 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v32, v41, 8, 8 +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v32, v49, 8, 8 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v32, v44, 8, 8 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_bfe_u32 v32, v47, 8, 8 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v44, v32 -; VI-NEXT: v_bfe_u32 v32, v32, 8, 8 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v32, v36, 8, 8 ; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v55, v32 +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v49, v47 +; VI-NEXT: v_mov_b32_e32 v44, v36 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v37, v32 ; VI-NEXT: v_bfe_u32 v32, v32, 8, 8 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v32, v36, 8, 8 -; 
VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v32, v47, 8, 8 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v32, v51, 8, 8 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_bfe_u32 v32, v32, 8, 8 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_bfe_u32 v32, v32, 8, 8 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v32, v58, 8, 8 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v32, v57, 8, 8 ; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v32, v59, 8, 8 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v32, v58, 8, 8 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v32, v55, 8, 8 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v32, v34, 8, 8 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v32, v53, 8, 8 -; VI-NEXT: v_mov_b32_e32 v58, v57 -; VI-NEXT: v_mov_b32_e32 v57, v59 -; VI-NEXT: v_mov_b32_e32 v59, v34 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v32, v41, 8, 8 -; VI-NEXT: 
buffer_store_dword v32, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v32, v46, 8, 8 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v54, v32 +; VI-NEXT: v_bfe_u32 v59, v32, 8, 8 ; VI-NEXT: .LBB94_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v31, 8, v31 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v32 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v32, 8, v32 +; VI-NEXT: v_or_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v32 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v2, v2, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v52 +; VI-NEXT: v_lshlrev_b16_e32 v32, 8, v32 +; VI-NEXT: v_or_b32_sdwa v2, v2, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v34, v32, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v32, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v45 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v56 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, 
s[0:3], s32 offset:368 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v44, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v42 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 
offset:288 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v44, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 ; VI-NEXT: buffer_store_dword 
v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; 
VI-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; VI-NEXT: 
s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; VI-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; VI-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: 
v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 60, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v50 -; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 64, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v59 +; VI-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; 
VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v31 -; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v33 +; VI-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte 
Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v40 -; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v52 +; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v49 -; VI-NEXT: s_waitcnt 
vmcnt(1) +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v40 +; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v35 +; VI-NEXT: buffer_load_dword v1, off, 
s[0:3], s32 offset:304 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -184227,10 +184310,10 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 
offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) @@ -184241,21 +184324,23 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 
offset:92 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) @@ -184266,8 +184351,8 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -184279,27 +184364,27 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, 
v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v43, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload @@ -186577,7 +186662,11 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 
offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v60, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:80 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 @@ -186599,459 +186688,489 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:64 ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v33, v28 +; SI-NEXT: v_mov_b32_e32 v28, v27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v11 -; SI-NEXT: v_mov_b32_e32 v59, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v43, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v42, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v61, v8 -; SI-NEXT: v_cvt_f16_f32_e32 
v58, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v14 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s17 ; SI-NEXT: v_cvt_f16_f32_e32 v13, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v12, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v16, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v18, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v14, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, s25 ; SI-NEXT: v_cvt_f16_f32_e32 v21, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v15, s27 ; SI-NEXT: v_cvt_f16_f32_e32 v19, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v24, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v23, s28 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 
0, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v17, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v51 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v48, v52 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(13) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v51, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v54 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v53, v40 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v53, v55 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v40, v44 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v40 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v52, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v44, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v17, s21 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v28, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v45 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f16_f32_e32 v45, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v46, 
v57 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v28, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v48, s27 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v47 +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 
offset:520 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:588 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v39, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB95_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v13, v13, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v44 +; SI-NEXT: v_or_b32_e32 v47, v13, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v20 ; SI-NEXT: v_or_b32_e32 v55, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v17 -; SI-NEXT: v_or_b32_e32 v57, v16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v20 -; SI-NEXT: v_or_b32_e32 v17, v14, v11 +; SI-NEXT: v_or_b32_e32 v13, v16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v18 +; SI-NEXT: v_or_b32_e32 v46, v14, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v21, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_or_b32_e32 v17, v21, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v48 ; SI-NEXT: v_or_b32_e32 v16, v19, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v24 -; SI-NEXT: v_or_b32_e32 v19, v23, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v29 -; SI-NEXT: v_or_b32_e32 v47, v60, v11 +; SI-NEXT: v_mov_b32_e32 v24, v32 +; SI-NEXT: v_or_b32_e32 
v22, v23, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v24 +; SI-NEXT: v_or_b32_e32 v14, v60, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_or_b32_e32 v43, v42, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v33 -; SI-NEXT: v_or_b32_e32 v14, v63, v11 +; SI-NEXT: v_mov_b32_e32 v21, v6 +; SI-NEXT: v_or_b32_e32 v23, v42, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 +; SI-NEXT: v_or_b32_e32 v60, v63, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v61 -; SI-NEXT: v_or_b32_e32 v42, v58, v11 +; SI-NEXT: v_or_b32_e32 v43, v3, v11 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v32, v31 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 +; SI-NEXT: v_mov_b32_e32 v31, v10 +; SI-NEXT: v_mov_b32_e32 v10, v9 +; SI-NEXT: v_mov_b32_e32 v9, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v60, v12, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v3 -; SI-NEXT: v_or_b32_e32 v22, v2, v11 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v6, v3, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v7 +; SI-NEXT: v_or_b32_e32 v12, v59, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v52 -; SI-NEXT: v_or_b32_e32 v12, v46, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v24, v2, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; SI-NEXT: v_or_b32_e32 v4, v4, v11 +; SI-NEXT: v_or_b32_e32 v63, v5, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v58 +; SI-NEXT: v_or_b32_e32 v42, v15, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v10 ; SI-NEXT: v_or_b32_e32 v34, v34, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v6 -; SI-NEXT: v_or_b32_e32 v3, v59, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v29 +; SI-NEXT: v_or_b32_e32 v5, v56, v11 ; 
SI-NEXT: v_lshlrev_b32_e32 v11, 16, v9 -; SI-NEXT: v_or_b32_e32 v59, v56, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v18 -; SI-NEXT: v_or_b32_e32 v6, v62, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v7 -; SI-NEXT: v_or_b32_e32 v62, v25, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v26 -; SI-NEXT: v_or_b32_e32 v2, v27, v11 +; SI-NEXT: v_or_b32_e32 v56, v62, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v27 +; SI-NEXT: v_or_b32_e32 v3, v30, v11 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v8 -; SI-NEXT: v_or_b32_e32 v25, v28, v11 +; SI-NEXT: v_or_b32_e32 v58, v25, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v33 +; SI-NEXT: v_or_b32_e32 v26, v26, v11 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v30 -; SI-NEXT: v_or_b32_e32 v1, v36, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v5 -; SI-NEXT: v_or_b32_e32 v23, v35, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v38 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v18, v39, v11 -; SI-NEXT: v_mov_b32_e32 v36, v2 -; SI-NEXT: v_mov_b32_e32 v35, v1 -; SI-NEXT: v_alignbit_b32 v1, v55, v13, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v25, v28, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v35 +; SI-NEXT: v_or_b32_e32 v29, v36, v11 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v62 +; SI-NEXT: v_or_b32_e32 v27, v38, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v39 +; SI-NEXT: v_or_b32_e32 v19, v37, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v4 +; SI-NEXT: v_or_b32_e32 v7, v50, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v49 +; SI-NEXT: v_mov_b32_e32 v36, v5 +; 
SI-NEXT: v_or_b32_e32 v5, v52, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v2 +; SI-NEXT: v_or_b32_e32 v39, v51, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v53 +; SI-NEXT: v_mov_b32_e32 v38, v3 +; SI-NEXT: v_or_b32_e32 v3, v54, v11 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_mov_b32_e32 v54, v7 +; SI-NEXT: v_alignbit_b32 v7, v55, v47, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v55, v13, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v55, v47, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v55, v13, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v55, v47, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v17, v57, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v46, v13, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v17, v57, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v46, v13, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v17, v57, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v46, v13, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v16, v21, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v16, v17, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v16, v21, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v16, v17, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v16, v21, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v16, v17, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v47, v19, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v14, v22, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v47, v19, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v14, v22, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v47, v19, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v14, v22, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v14, v43, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v60, v23, 24 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v1 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, 
v14, v43, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v60, v23, 16 +; SI-NEXT: v_or_b32_e32 v50, v44, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v40 +; SI-NEXT: v_mov_b32_e32 v40, v43 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v14, v43, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v60, v23, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v60, v42, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v6, v40, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v60, v42, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v6, v40, 16 +; SI-NEXT: v_mov_b32_e32 v53, v12 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v60, v42, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v6, v40, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v24, v22, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v63, v53, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v24, v22, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: 
v_alignbit_b32 v7, v63, v53, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v24, v22, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v34, v42, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v34, v4, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v34, v42, 16 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v7, v34, v42, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v34, v4, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v56, v36, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v34, v4, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v56, v36, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v59, v3, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v56, v36, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v59, v3, 16 -; SI-NEXT: buffer_store_dword v1, off, 
s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v58, v38, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v59, v3, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v26, 8, v34 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v38 -; SI-NEXT: v_or_b32_e32 v61, v50, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v49 -; SI-NEXT: v_or_b32_e32 v2, v48, v11 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v49, v6 +; SI-NEXT: v_alignbit_b32 v7, v58, v38, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v62, v49, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v58, v38, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v62, v49, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v25, v26, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v62, v49, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v25, v26, 16 +; SI-NEXT: v_or_b32_e32 v15, v45, v11 +; SI-NEXT: v_mov_b32_e32 v45, v27 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v25, v36, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v25, v26, 8 +; SI-NEXT: buffer_store_dword v7, off, 
s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v25, v36, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v45, v29, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v25, v36, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v45, v29, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v23, v35, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v45, v29, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v23, v35, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v54, v19, 24 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v41 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v23, v35, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v54, v19, 16 +; SI-NEXT: v_alignbit_b32 v11, v54, v19, 8 +; SI-NEXT: v_or_b32_e32 v12, v57, v12 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v61, v18, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v7, v19 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v29, v39, v5, 24 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, 
v61, v18, 16 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v48 -; SI-NEXT: v_or_b32_e32 v58, v54, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v51 -; SI-NEXT: v_or_b32_e32 v6, v53, v11 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v1, v61, v18, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v11, v39, v5, 16 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_alignbit_b32 v5, v39, v5, 8 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v42, v50, v3, 24 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v58, v2, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v1, v58, v2, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v5, v50, v3, 16 +; SI-NEXT: v_mov_b32_e32 v59, v3 +; SI-NEXT: v_alignbit_b32 v57, v50, v3, 8 +; SI-NEXT: v_alignbit_b32 v3, v12, v15, 24 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v55 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v55 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 
8, v46 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v47 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v14 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v14 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v60 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v60 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v6 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v63 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v59 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v34 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v62 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v56 +; SI-NEXT: 
buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v25 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v58 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v23 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v37 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v25 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v61 -; SI-NEXT: v_or_b32_e32 v54, v40, v11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v45 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v58 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v54 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v54 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v39 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v50 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v44, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded 
Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v12 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v20, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v3, v20, 8, 8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v15, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v3, v18, 8, 8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v29, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v3, v48, 8, 8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v33, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v3, v24, 8, 8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v32, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v3, v21, 8, 8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v31, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v3, v32, 8, 8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v10, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v3, v31, 8, 8 +; SI-NEXT: buffer_store_dword v3, off, 
s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v9, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v3, v10, 8, 8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v8, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v3, v9, 8, 8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v5, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v3, v8, 8, 8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v38, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v3, v30, 8, 8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v48, 8, 8 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v41 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v3, v62, 8, 8 +; SI-NEXT: v_mov_b32_e32 v44, v1 +; SI-NEXT: v_bfe_u32 v1, v1, 8, 8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v37, 8, 8 -; SI-NEXT: v_or_b32_e32 v11, v45, v11 +; SI-NEXT: v_bfe_u32 v3, v4, 8, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v52, 8, 8 -; SI-NEXT: v_alignbit_b32 v28, v58, v2, 24 -; SI-NEXT: v_alignbit_b32 v2, v54, v6, 24 -; SI-NEXT: v_alignbit_b32 v39, v54, v6, 16 -; SI-NEXT: v_alignbit_b32 v40, v54, v6, 8 -; SI-NEXT: 
v_alignbit_b32 v27, v12, v11, 24 -; SI-NEXT: v_alignbit_b32 v56, v12, v11, 16 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v30, v12, v11, 8 -; SI-NEXT: v_mov_b32_e32 v20, v29 -; SI-NEXT: v_mov_b32_e32 v15, v33 -; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_bfe_u32 v1, v41, 8, 8 +; SI-NEXT: v_alignbit_b32 v27, v63, v53, 16 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v26, v12, v15, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v15 +; SI-NEXT: v_alignbit_b32 v43, v12, v15, 8 +; SI-NEXT: v_mov_b32_e32 v52, v20 +; SI-NEXT: v_mov_b32_e32 v20, v18 +; SI-NEXT: v_mov_b32_e32 v18, v48 +; SI-NEXT: v_mov_b32_e32 v15, v24 +; SI-NEXT: v_mov_b32_e32 v28, v21 +; SI-NEXT: v_mov_b32_e32 v21, v23 +; SI-NEXT: v_mov_b32_e32 v23, v22 +; SI-NEXT: v_mov_b32_e32 v22, v17 +; SI-NEXT: v_mov_b32_e32 v17, v13 +; SI-NEXT: v_mov_b32_e32 v13, v47 ; SI-NEXT: v_mov_b32_e32 v32, v31 ; SI-NEXT: v_mov_b32_e32 v31, v10 ; SI-NEXT: v_mov_b32_e32 v10, v9 -; SI-NEXT: v_mov_b32_e32 v9, v7 -; SI-NEXT: v_bfe_u32 v29, v7, 8, 8 -; SI-NEXT: v_mov_b32_e32 v7, v8 -; SI-NEXT: v_mov_b32_e32 v8, v5 -; SI-NEXT: v_mov_b32_e32 v44, v37 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v48, v30 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, v2 +; SI-NEXT: v_bfe_u32 v30, v2, 8, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v37, v62 ; SI-NEXT: s_branch .LBB95_3 ; SI-NEXT: .LBB95_2: -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v52, v20 +; SI-NEXT: v_mov_b32_e32 v20, v18 +; SI-NEXT: v_mov_b32_e32 v18, v48 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_mov_b32_e32 v44, v1 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 @@ -187197,391 +187316,390 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: v_mov_b32_e32 v20, v29 -; SI-NEXT: v_mov_b32_e32 v15, v33 -; SI-NEXT: v_mov_b32_e32 v33, v32 -; SI-NEXT: v_mov_b32_e32 v32, v31 -; SI-NEXT: v_mov_b32_e32 v31, v10 -; SI-NEXT: v_mov_b32_e32 v10, v9 -; SI-NEXT: v_mov_b32_e32 v9, v7 -; SI-NEXT: v_mov_b32_e32 v7, v8 -; SI-NEXT: v_mov_b32_e32 v8, v5 -; SI-NEXT: v_mov_b32_e32 v44, v37 +; SI-NEXT: v_mov_b32_e32 v15, v32 +; SI-NEXT: v_mov_b32_e32 v28, v6 +; SI-NEXT: v_mov_b32_e32 v33, v31 +; SI-NEXT: v_mov_b32_e32 v32, v10 +; SI-NEXT: v_mov_b32_e32 v31, v9 +; SI-NEXT: v_mov_b32_e32 v10, v8 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v3, v2 ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; 
implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: .LBB95_3: ; %Flow -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, v44 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:488 ; 4-byte Folded 
Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v5, v8 -; SI-NEXT: v_mov_b32_e32 v6, v7 -; SI-NEXT: v_mov_b32_e32 v7, v9 -; SI-NEXT: v_mov_b32_e32 v8, v10 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_mov_b32_e32 v47, v48 +; SI-NEXT: v_mov_b32_e32 v62, v51 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v2, v3 ; SI-NEXT: v_mov_b32_e32 v9, v31 ; SI-NEXT: v_mov_b32_e32 v31, v33 -; SI-NEXT: v_mov_b32_e32 v44, v15 -; SI-NEXT: v_mov_b32_e32 v33, v20 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v50, v2 -; SI-NEXT: v_mov_b32_e32 v53, v40 -; SI-NEXT: v_mov_b32_e32 v40, v28 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_mov_b32_e32 v2, v48 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v33, v15 +; SI-NEXT: v_mov_b32_e32 v15, v18 +; SI-NEXT: v_mov_b32_e32 v18, v20 +; SI-NEXT: v_mov_b32_e32 v20, v52 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:460 ; 4-byte 
Folded Reload +; SI-NEXT: v_mov_b32_e32 v3, v41 +; SI-NEXT: v_mov_b32_e32 v35, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v11, v27 -; SI-NEXT: v_mov_b32_e32 v38, v30 -; SI-NEXT: v_mov_b32_e32 v27, v52 -; SI-NEXT: v_mov_b32_e32 v30, v29 -; SI-NEXT: v_mov_b32_e32 v29, v26 +; SI-NEXT: v_mov_b32_e32 v4, v8 +; SI-NEXT: v_mov_b32_e32 v8, v10 +; SI-NEXT: v_mov_b32_e32 v10, v32 +; SI-NEXT: v_mov_b32_e32 v32, v28 +; SI-NEXT: v_mov_b32_e32 v28, v43 +; SI-NEXT: v_mov_b32_e32 v43, v42 +; SI-NEXT: v_mov_b32_e32 v42, v5 +; SI-NEXT: v_mov_b32_e32 v24, v29 +; SI-NEXT: v_mov_b32_e32 v29, v1 +; SI-NEXT: v_mov_b32_e32 v1, v19 +; SI-NEXT: v_mov_b32_e32 v19, v7 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_mov_b32_e32 v5, v37 ; SI-NEXT: s_cbranch_vccnz .LBB95_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: 
v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v2 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v35, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v6 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v35, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v49 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v41 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; SI-NEXT: 
v_add_f32_e32 v33, 0x38000000, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_add_f32_e32 v38, 
0x38000000, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 ; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v34 -; SI-NEXT: v_or_b32_e32 v56, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v4 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_or_b32_e32 v12, v12, v13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v13, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_or_b32_e32 
v36, v14, v13 +; SI-NEXT: v_or_b32_e32 v59, v14, v13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_or_b32_e32 v54, v14, v16 +; SI-NEXT: v_or_b32_e32 v55, v14, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v16, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v17, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v52, v17, v16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v17, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v17, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v58, v17, v19 -; SI-NEXT: v_alignbit_b32 v40, v58, v52, 24 +; SI-NEXT: v_or_b32_e32 v17, v17, v19 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload 
+; SI-NEXT: v_cvt_f32_f16_e32 v19, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v21, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v11, v21, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v13 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v21, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v61, v21, v22 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v54, v21, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v7 +; 
SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v23, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v16, v23, v22 +; SI-NEXT: v_or_b32_e32 v36, v23, v22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v23, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v23, v23, v25 +; SI-NEXT: v_or_b32_e32 v45, v23, v25 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v24, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v26, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v26, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_or_b32_e32 v48, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v6 +; SI-NEXT: v_or_b32_e32 v13, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 
v25, 16, v47 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v26, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_or_b32_e32 v53, v26, v27 +; SI-NEXT: v_or_b32_e32 v14, v26, v27 ; SI-NEXT: v_mov_b32_e32 v26, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v7 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_or_b32_e32 v62, v28, v27 +; SI-NEXT: v_or_b32_e32 v58, v28, v27 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v29, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v27, 
v34 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_or_b32_e32 v59, v29, v34 +; SI-NEXT: v_or_b32_e32 v56, v29, v34 ; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v35 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v29, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v51 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v30, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v29, v16 +; SI-NEXT: v_mov_b32_e32 v62, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v7 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_or_b32_e32 v3, v30, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v44 +; SI-NEXT: v_or_b32_e32 v7, v30, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v52 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v35, v13 +; SI-NEXT: v_mov_b32_e32 v28, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v48 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 
v10, v10 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_or_b32_e32 v4, v34, v30 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v32 ; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_mov_b32_e32 v30, v10 -; SI-NEXT: v_mov_b32_e32 v32, v30 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v41, v34, v30 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v9 ; SI-NEXT: v_or_b32_e32 v34, v35, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v46 -; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_or_b32_e32 v22, v35, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_or_b32_e32 v53, v35, v36 ; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v10 -; SI-NEXT: v_mov_b32_e32 v35, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v37, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v63, v37, v36 +; SI-NEXT: v_mov_b32_e32 v36, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v38 +; SI-NEXT: v_or_b32_e32 v40, v39, v37 +; SI-NEXT: v_mov_b32_e32 v38, v14 +; SI-NEXT: v_alignbit_b32 v27, v63, v53, 16 +; SI-NEXT: v_alignbit_b32 v30, v63, v53, 8 +; SI-NEXT: v_bfe_u32 v35, v2, 8, 8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v37, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:576 ; 4-byte Folded 
Reload ; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_or_b32_e32 v24, v37, v36 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v38 -; SI-NEXT: v_or_b32_e32 v42, v39, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v45 -; SI-NEXT: v_mov_b32_e32 v36, v48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v39, v7 ; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v39 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v31 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v37, v13 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_or_b32_e32 v60, v37, v39 +; SI-NEXT: v_or_b32_e32 v6, v37, v39 +; SI-NEXT: v_mov_b32_e32 v39, v17 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v48 ; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v49 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_alignbit_b32 v39, v54, v29, 16 -; SI-NEXT: v_or_b32_e32 v43, v48, v37 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v44 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_or_b32_e32 v14, v49, v48 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v24, v39, v1, 24 +; SI-NEXT: v_or_b32_e32 v21, v48, v37 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v13 ; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v28, v14, v43, 8 ; 
SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_or_b32_e32 v60, v49, v48 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -187590,20 +187708,20 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_or_b32_e32 v19, v48, v37 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v23, v48, v37 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v33 -; SI-NEXT: v_or_b32_e32 v47, v49, v37 +; SI-NEXT: v_or_b32_e32 v14, v49, v37 ; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 ; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v48 -; SI-NEXT: v_or_b32_e32 v21, v50, v37 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v22, v50, v37 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v37, 
v37 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -187613,26 +187731,27 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_or_b32_e32 v16, v37, v49 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v48 ; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v50 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_alignbit_b32 v50, v54, v29, 24 -; SI-NEXT: v_or_b32_e32 v57, v48, v37 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v50, v55 +; SI-NEXT: v_alignbit_b32 v43, v50, v59, 24 +; SI-NEXT: v_alignbit_b32 v42, v50, v59, 16 +; SI-NEXT: v_or_b32_e32 v17, v48, v37 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v18 +; SI-NEXT: v_alignbit_b32 v57, v50, v59, 8 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 ; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_or_b32_e32 v17, v49, v48 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v46, v49, v48 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_mov_b32_e32 v49, v53 -; SI-NEXT: v_alignbit_b32 v53, v54, v29, 8 ; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 @@ -187643,573 +187762,575 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v13, v48, v37 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, 
v20 ; SI-NEXT: v_or_b32_e32 v55, v51, v37 -; SI-NEXT: v_alignbit_b32 v10, v55, v13, 16 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v55, v13, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v55, v13, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v55, v13, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v17, v57, 24 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v55, v13, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v17, v57, 16 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v46, v17, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v17, v57, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v46, v17, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v16, v21, 24 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v46, v17, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v16, v21, 16 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v16, v22, 24 +; SI-NEXT: buffer_store_dword v7, off, 
s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v16, v21, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v16, v22, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v47, v19, 24 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v16, v22, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v47, v19, 16 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v14, v23, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v47, v19, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v14, v23, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v14, v43, 24 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v14, v23, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v14, v43, 16 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v60, v21, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v60, v42, 24 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v60, v21, 16 +; 
SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v60, v42, 16 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v60, v21, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v60, v42, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v6, v40, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v24, v22, 24 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v6, v40, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v24, v22, 16 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v6, v40, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v24, v22, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v63, v53, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v34, v4, 24 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v34, v41, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v34, v4, 16 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: 
v_alignbit_b32 v7, v34, v41, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v34, v4, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v34, v41, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v59, v3, 24 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v56, v36, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v59, v3, 16 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v56, v36, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v59, v3, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v56, v36, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v62, v49, 24 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v58, v38, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v62, v49, 16 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v58, v38, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v62, v49, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:216 ; 
4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v58, v38, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v25, v36, 24 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v25, v26, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v25, v36, 16 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v25, v26, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v7, v25, v26, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v25, v36, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v45, v28, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v23, v35, 24 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v45, v28, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v7, v45, v28, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v23, v35, 16 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, 
v54, v19, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v23, v35, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v54, v19, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v61, v11, 24 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v54, v19, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v61, v11, 16 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v10, v61, v11, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v39, v1, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v58, v52, 16 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v10, v58, v52, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v39, v1, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v10, v56 -; SI-NEXT: v_alignbit_b32 v11, v12, v10, 24 -; SI-NEXT: v_alignbit_b32 v56, v12, v10, 16 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; 
SI-NEXT: v_alignbit_b32 v38, v12, v10, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v55 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v55 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v46 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v17 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v16 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v14 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v47 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v60 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v14 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v6 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v60 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v63 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v24 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], 
s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v34 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v59 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v56 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v62 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v58 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v25 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v25 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v23 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v45 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v61 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v54 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v58 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v39 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v54 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 
offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v50 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v12 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v12 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v10, v20, 8, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v7, v20, 8, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v10, v18, 8, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v7, v18, 8, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v10, v15, 8, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v7, v15, 8, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v10, v33, 8, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v7, v33, 8, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v10, v44, 8, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v7, v32, 8, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v10, v31, 8, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v7, 
v31, 8, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v10, v30, 8, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v7, v10, 8, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v10, v9, 8, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v7, v9, 8, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v10, v8, 8, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v7, v8, 8, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v10, v6, 8, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v7, v29, 8, 8 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v10, v5, 8, 8 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_bfe_u32 v10, v26, 8, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v7, v47, 8, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v10, v2, 8, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 
offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v7, v5, 8, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v10, v1, 8, 8 -; SI-NEXT: v_alignbit_b32 v48, v55, v13, 24 -; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v34 -; SI-NEXT: v_bfe_u32 v30, v7, 8, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v7, v4, 8, 8 +; SI-NEXT: v_alignbit_b32 v26, v12, v11, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v10, v27, 8, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v26, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v28, v12, v11, 8 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_bfe_u32 v7, v29, 8, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v7, v3, 8, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: .LBB95_5: ; %end ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v37, 0xff, v13 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v52, 24, v48 ; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_lshlrev_b32_e32 
v9, 16, v9 ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v37, v37, v51 ; SI-NEXT: v_and_b32_e32 v37, 0xffff, v37 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v51, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v51, 0xff, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v52, 24, v7 ; SI-NEXT: v_or_b32_e32 v51, v52, v51 ; SI-NEXT: v_or_b32_e32 v37, v37, v51 ; SI-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v37, 0xff, v55 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v37, v37, v51 ; SI-NEXT: v_and_b32_e32 v37, 0xffff, v37 ; SI-NEXT: s_waitcnt 
vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v48, 24, v10 +; SI-NEXT: v_lshlrev_b32_e32 v48, 24, v7 ; SI-NEXT: v_or_b32_e32 v20, v48, v20 ; SI-NEXT: v_or_b32_e32 v20, v37, v20 ; SI-NEXT: v_add_i32_e32 v37, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v20, v37, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v20, 0xff, v57 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v20, v20, v37 ; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v37, 0xff, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v37, 0xff, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v48, 24, v10 +; SI-NEXT: v_lshlrev_b32_e32 v48, 24, v7 ; SI-NEXT: v_or_b32_e32 v37, v48, v37 ; SI-NEXT: v_or_b32_e32 v20, v20, v37 ; SI-NEXT: v_add_i32_e32 v37, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v20, v37, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v20, 0xff, v17 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v46 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 
v37, 8, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v20, v20, v37 ; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v10 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v7 ; SI-NEXT: v_or_b32_e32 v18, v37, v18 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 ; SI-NEXT: v_add_i32_e32 v20, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v18, v20, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xff, v21 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v18, v18, v20 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v20, 0xff, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v20, 0xff, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v10 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v7 ; SI-NEXT: v_or_b32_e32 v20, v37, v20 ; SI-NEXT: v_or_b32_e32 v18, v18, v20 ; SI-NEXT: v_add_i32_e32 v20, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v18, v20, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v18, 0xff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) 
-; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v18, v18, v20 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v10 +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v7 ; SI-NEXT: v_or_b32_e32 v15, v20, v15 ; SI-NEXT: v_or_b32_e32 v15, v18, v15 ; SI-NEXT: v_add_i32_e32 v18, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xff, v19 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v23 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v15, v15, v18 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xff, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v18, 0xff, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v10 +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v7 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 ; SI-NEXT: v_or_b32_e32 v15, v15, v18 ; SI-NEXT: v_add_i32_e32 v18, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword 
v7, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xff, v47 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v15, v15, v18 ; SI-NEXT: v_and_b32_e32 v18, 0xff, v33 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v10 +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v7 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 ; SI-NEXT: v_or_b32_e32 v15, v15, v18 ; SI-NEXT: v_add_i32_e32 v18, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xff, v43 -; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v28 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v15, v15, v18 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xff, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v18, 0xff, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v10 +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v7 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 ; SI-NEXT: v_or_b32_e32 v15, v15, v18 ; SI-NEXT: 
v_add_i32_e32 v18, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v60 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v15, v15, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xff, v44 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v32 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v10 +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v7 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 ; SI-NEXT: v_or_b32_e32 v15, v15, v18 ; SI-NEXT: v_add_i32_e32 v18, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xff, v42 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v40 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v15, v15, v18 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xff, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v18, 0xff, v7 +; 
SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v10 +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v7 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 ; SI-NEXT: v_or_b32_e32 v15, v15, v18 ; SI-NEXT: v_add_i32_e32 v18, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xff, v60 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v15, v15, v18 ; SI-NEXT: v_and_b32_e32 v18, 0xff, v31 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v10 +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v7 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 ; SI-NEXT: v_or_b32_e32 v15, v15, v18 ; SI-NEXT: v_add_i32_e32 v18, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xff, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v15, 0xff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v30 ; 
SI-NEXT: v_or_b32_e32 v15, v15, v18 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xff, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v18, 0xff, v27 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v10 +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v7 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 ; SI-NEXT: v_or_b32_e32 v15, v15, v18 ; SI-NEXT: v_add_i32_e32 v18, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xff, v24 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v32 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v63 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v15, v15, v18 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v7 ; SI-NEXT: v_or_b32_e32 v10, v18, v10 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: v_or_b32_e32 v10, v15, v10 ; SI-NEXT: v_add_i32_e32 v15, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v10, v15, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xff, v4 +; 
SI-NEXT: v_and_b32_e32 v10, 0xff, v41 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v10, v10, v15 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xff, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v15, 0xff, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v7 ; SI-NEXT: v_or_b32_e32 v15, v18, v15 ; SI-NEXT: v_or_b32_e32 v10, v10, v15 ; SI-NEXT: v_add_i32_e32 v15, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v10, v15, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v10, 0xff, v34 -; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v10, v10, v15 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v7 ; SI-NEXT: v_or_b32_e32 v9, v15, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_add_i32_e32 v10, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; 
SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v3 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v7 ; SI-NEXT: v_or_b32_e32 v10, v15, v10 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: v_add_i32_e32 v10, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v59 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v56 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v7 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, 
0x44, v0 ; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v49 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v38 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xff, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v7 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v62 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v58 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v4 +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v7 ; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v30 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v62 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt 
vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v36 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v4 +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 @@ -188222,24 +188343,26 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v7, 0xff, v25 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v4 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; SI-NEXT: 
v_or_b32_e32 v6, v8, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v35 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; SI-NEXT: v_or_b32_e32 v6, v6, v7 @@ -188254,25 +188377,24 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v23 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v4 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v45 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, 
off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v19 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 @@ -188287,17 +188409,13 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v61 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v54 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 @@ -188305,11 +188423,12 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_add_i32_e32 v5, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; SI-NEXT: 
buffer_load_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v40 -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v24 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 @@ -188324,27 +188443,23 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v58 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v39 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v35 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; SI-NEXT: v_or_b32_e32 v2, v5, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v53 -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v50 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v57 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v39 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v42 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v43 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: 
v_or_b32_e32 v2, v2, v4 @@ -188352,7 +188467,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v50 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 @@ -188364,33 +188479,33 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v38 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v11 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v28 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v56 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v26 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: 
v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v27 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen @@ -188417,8 +188532,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: v_writelane_b32 v63, s30, 0 ; VI-NEXT: v_writelane_b32 v63, s31, 1 @@ -188648,112 +188763,117 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: s_lshr_b32 s46, s45, 16 ; VI-NEXT: v_mov_b32_e32 v7, 0x200 ; VI-NEXT: v_add_f16_e32 v1, s46, v7 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_add_f16_e32 v2, s45, v7 ; VI-NEXT: s_lshr_b32 s45, s44, 16 ; VI-NEXT: v_or_b32_e32 v23, v2, v1 ; VI-NEXT: v_add_f16_e32 v1, s45, v7 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill 
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_add_f16_e32 v2, s44, v7 ; VI-NEXT: s_lshr_b32 s44, s43, 16 ; VI-NEXT: v_or_b32_e32 v22, v2, v1 ; VI-NEXT: v_add_f16_e32 v1, s44, v7 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_add_f16_e32 v2, s43, v7 ; VI-NEXT: s_lshr_b32 s43, s42, 16 ; VI-NEXT: v_or_b32_e32 v25, v2, v1 ; VI-NEXT: v_add_f16_e32 v1, s43, v7 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_add_f16_e32 v2, s42, v7 ; VI-NEXT: s_lshr_b32 s42, s41, 16 ; VI-NEXT: v_or_b32_e32 v24, v2, v1 ; VI-NEXT: v_add_f16_e32 v1, s42, v7 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_add_f16_e32 v2, s41, v7 ; VI-NEXT: s_lshr_b32 s41, s40, 16 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v27, v2, v1 ; VI-NEXT: v_add_f16_e32 v1, s41, v7 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded 
Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_add_f16_e32 v2, s40, v7 ; VI-NEXT: s_lshr_b32 s40, s15, 16 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_add_f16_e32 v53, s40, v7 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v26, v2, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; VI-NEXT: v_add_f16_e32 v1, s40, v7 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_add_f16_e32 v2, s15, v7 ; VI-NEXT: s_lshr_b32 s15, s14, 16 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v29, v2, v1 ; VI-NEXT: v_add_f16_e32 v1, s15, v7 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_add_f16_e32 v2, s14, v7 ; VI-NEXT: s_lshr_b32 s14, s13, 16 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_add_f16_e32 v43, s14, v7 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v28, v2, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; VI-NEXT: v_add_f16_e32 v1, s14, v7 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_add_f16_e32 v2, s13, v7 ; VI-NEXT: s_lshr_b32 s13, s12, 16 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 
offset:80 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v6, v2, v1 ; VI-NEXT: v_add_f16_e32 v1, s13, v7 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_add_f16_e32 v2, s12, v7 ; VI-NEXT: s_lshr_b32 s12, s11, 16 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_add_f16_e32 v37, s12, v7 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v5, v2, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 +; VI-NEXT: v_add_f16_e32 v1, s12, v7 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_add_f16_e32 v2, s11, v7 ; VI-NEXT: s_lshr_b32 s11, s10, 16 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v31, v2, v1 ; VI-NEXT: v_add_f16_e32 v1, s11, v7 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_add_f16_e32 v2, s10, v7 ; VI-NEXT: s_lshr_b32 s10, s9, 16 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_add_f16_e32 v52, s10, v7 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v30, v2, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 +; VI-NEXT: v_add_f16_e32 v1, s10, v7 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte 
Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_add_f16_e32 v2, s9, v7 ; VI-NEXT: s_lshr_b32 s9, s8, 16 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v4, v2, v1 ; VI-NEXT: v_add_f16_e32 v1, s9, v7 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_add_f16_e32 v2, s8, v7 ; VI-NEXT: s_lshr_b32 s8, s7, 16 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_add_f16_e32 v50, s8, v7 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v3, v2, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 +; VI-NEXT: v_add_f16_e32 v1, s8, v7 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_add_f16_e32 v2, s7, v7 ; VI-NEXT: s_lshr_b32 s7, s6, 16 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v2, v2, v1 ; VI-NEXT: v_add_f16_e32 v1, s7, v7 ; VI-NEXT: v_add_f16_e32 v8, s6, v7 ; VI-NEXT: s_lshr_b32 s6, s17, 16 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_add_f16_e32 v36, s6, v7 -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v1, v8, v1 ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, 
v36 ; VI-NEXT: v_add_f16_e32 v9, s17, v7 @@ -188761,12 +188881,12 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_e32 v33, v9, v8 ; VI-NEXT: v_add_f16_e32 v8, s6, v7 ; VI-NEXT: s_lshr_b32 s6, s19, 16 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; VI-NEXT: v_add_f16_e32 v9, s16, v7 ; VI-NEXT: v_add_f16_e32 v38, s6, v7 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v32, v9, v8 ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v38 ; VI-NEXT: v_add_f16_e32 v9, s19, v7 @@ -188774,12 +188894,12 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_e32 v21, v9, v8 ; VI-NEXT: v_add_f16_e32 v8, s6, v7 ; VI-NEXT: s_lshr_b32 s6, s21, 16 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; VI-NEXT: v_add_f16_e32 v9, s18, v7 ; VI-NEXT: v_add_f16_e32 v61, s6, v7 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v20, v9, v8 ; VI-NEXT: s_lshr_b32 s7, s20, 16 ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v61 @@ -188787,12 +188907,12 @@ define inreg <128 x i8> 
@bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_e32 v35, v9, v8 ; VI-NEXT: v_add_f16_e32 v8, s7, v7 ; VI-NEXT: s_lshr_b32 s6, s23, 16 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; VI-NEXT: v_add_f16_e32 v9, s20, v7 ; VI-NEXT: v_add_f16_e32 v45, s6, v7 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v34, v9, v8 ; VI-NEXT: s_lshr_b32 s7, s22, 16 ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v45 @@ -188800,12 +188920,12 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_e32 v19, v9, v8 ; VI-NEXT: v_add_f16_e32 v8, s7, v7 ; VI-NEXT: s_lshr_b32 s6, s25, 16 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; VI-NEXT: v_add_f16_e32 v9, s22, v7 ; VI-NEXT: v_add_f16_e32 v47, s6, v7 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v18, v9, v8 ; VI-NEXT: s_lshr_b32 s7, s24, 16 ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v47 @@ -188813,12 +188933,12 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_e32 v16, v9, v8 ; 
VI-NEXT: v_add_f16_e32 v8, s7, v7 ; VI-NEXT: s_lshr_b32 s6, s27, 16 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; VI-NEXT: v_add_f16_e32 v9, s24, v7 ; VI-NEXT: v_add_f16_e32 v57, s6, v7 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v15, v9, v8 ; VI-NEXT: s_lshr_b32 s7, s26, 16 ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v57 @@ -188826,12 +188946,12 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_e32 v13, v9, v8 ; VI-NEXT: v_add_f16_e32 v8, s7, v7 ; VI-NEXT: s_lshr_b32 s6, s29, 16 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; VI-NEXT: v_add_f16_e32 v9, s26, v7 ; VI-NEXT: v_add_f16_e32 v59, s6, v7 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v12, v9, v8 ; VI-NEXT: s_lshr_b32 s7, s28, 16 ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v59 @@ -188841,96 +188961,80 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_add_f16_e32 v8, s7, v7 ; VI-NEXT: s_lshr_b32 s7, s4, 16 ; VI-NEXT: v_add_f16_e32 v51, s6, v7 -; VI-NEXT: buffer_store_dword 
v9, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; VI-NEXT: v_add_f16_e32 v9, s28, v7 ; VI-NEXT: v_add_f16_e32 v54, s5, v7 -; VI-NEXT: v_add_f16_e32 v11, s7, v7 +; VI-NEXT: v_add_f16_e32 v53, s7, v7 ; VI-NEXT: v_add_f16_e32 v55, s4, v7 ; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v51 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v9, v9, v8 ; VI-NEXT: v_or_b32_e32 v8, v54, v7 -; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v53 ; VI-NEXT: v_or_b32_e32 v7, v55, v7 ; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v8 -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v7 ; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[7:8] -; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v10 -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v9 -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v13 -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v2 -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v1 ; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[1:2] -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 
offset:320 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v4 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[3:4] ; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v16 ; VI-NEXT: v_lshrrev_b64 v[16:17], 24, v[15:16] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v13 ; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[12:13] -; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v18 +; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v18 ; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[18:19] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[3:4] -; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v15 -; VI-NEXT: v_lshrrev_b32_e32 v40, 8, v19 -; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v35 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v15 +; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v19 +; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v35 ; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[34:35] ; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v23 
; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[22:23] -; VI-NEXT: v_bfe_u32 v23, v50, 8, 8 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v23, v52, 8, 8 +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v31 -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v23, v37, 8, 8 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v30 -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v23, v43, 8, 8 -; VI-NEXT: v_lshrrev_b64 v[10:11], 24, v[9:10] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v23, v53, 8, 8 -; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v12 -; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v34 -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v24 -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[24:25] -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[30:31] -; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v20 +; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v10 +; VI-NEXT: v_lshrrev_b64 v[10:11], 24, v[9:10] 
+; VI-NEXT: v_lshrrev_b32_e32 v40, 8, v20 ; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[20:21] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v30 ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5 ; VI-NEXT: v_lshrrev_b64 v[4:5], 24, v[5:6] -; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v21 +; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v12 +; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v34 +; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v21 +; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v32 ; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[32:33] -; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v6 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v29 ; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[28:29] +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v26 ; VI-NEXT: v_lshrrev_b64 v[8:9], 24, v[26:27] -; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v33 -; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v32 -; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v29 +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v24 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[24:25] +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v33 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 ; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v27 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v26 ; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v25 ; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v22 ; VI-NEXT: v_bfe_u32 v25, v51, 8, 8 @@ -188942,11 +189046,31 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_bfe_u32 v22, v38, 8, 8 ; VI-NEXT: v_bfe_u32 v2, v36, 8, 8 ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_bfe_u32 v26, v50, 8, 8 -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_bfe_u32 v23, 
v23, 8, 8 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_bfe_u32 v24, v24, 8, 8 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_bfe_u32 v26, v26, 8, 8 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_bfe_u32 v23, v23, 8, 8 +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_bfe_u32 v23, v23, 8, 8 +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_bfe_u32 v23, v23, 8, 8 +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_bfe_u32 v23, v23, 8, 8 +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_bfe_u32 v23, v23, 8, 8 ; VI-NEXT: s_branch .LBB95_5 ; VI-NEXT: .LBB95_3: ; VI-NEXT: ; implicit-def: $sgpr46 @@ -189106,133 +189230,136 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: s_branch .LBB95_2 ; VI-NEXT: .LBB95_4: ; VI-NEXT: v_mov_b32_e32 v1, s44 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s45 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 
4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s42 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s43 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s40 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s41 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s14 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s15 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s12 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s13 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s10 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s11 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, 
s[0:3], s32 offset:108 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s8 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s6 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s16 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s18 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s19 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s20 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s21 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s22 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s23 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s24 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s25 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s26 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s27 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s28 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s29 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s71 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s70 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 
4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s69 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s68 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s67 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s66 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s65 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s64 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s55 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s54 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s87 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s86 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s85 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; 
VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s84 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s53 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s52 ; VI-NEXT: v_readlane_b32 s6, v62, 0 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 1 ; VI-NEXT: v_mov_b32_e32 v36, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 2 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 3 ; VI-NEXT: v_mov_b32_e32 v38, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 4 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 5 ; VI-NEXT: v_mov_b32_e32 v61, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 6 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 7 ; VI-NEXT: v_mov_b32_e32 v45, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 8 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 9 ; VI-NEXT: v_mov_b32_e32 v47, s6 ; VI-NEXT: 
v_readlane_b32 s6, v62, 10 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 11 -; VI-NEXT: v_mov_b32_e32 v57, s6 -; VI-NEXT: v_readlane_b32 s6, v62, 12 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s6 -; VI-NEXT: v_readlane_b32 s6, v62, 13 ; VI-NEXT: v_mov_b32_e32 v55, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 16 -; VI-NEXT: v_mov_b32_e32 v59, s6 -; VI-NEXT: v_readlane_b32 s6, v62, 14 +; VI-NEXT: v_mov_b32_e32 v57, s6 +; VI-NEXT: v_readlane_b32 s6, v62, 12 ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 17 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: v_mov_b32_e32 v22, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 18 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 19 ; VI-NEXT: v_mov_b32_e32 v15, s4 @@ -189247,77 +189374,74 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_readlane_b32 s4, v62, 24 ; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 25 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 26 -; VI-NEXT: v_mov_b32_e32 v46, s4 +; VI-NEXT: v_mov_b32_e32 v58, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 27 -; VI-NEXT: v_mov_b32_e32 v41, s4 +; VI-NEXT: v_mov_b32_e32 v56, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 28 
-; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, s80 -; VI-NEXT: v_mov_b32_e32 v60, s4 +; VI-NEXT: v_mov_b32_e32 v44, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 29 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, s81 -; VI-NEXT: v_mov_b32_e32 v40, s4 +; VI-NEXT: v_mov_b32_e32 v49, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 30 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, s82 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s80 ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 31 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s81 +; VI-NEXT: v_mov_b32_e32 v37, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 32 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s82 +; VI-NEXT: v_mov_b32_e32 v42, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 33 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 34 ; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 34 +; VI-NEXT: v_mov_b32_e32 v41, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 35 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v43, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 36 ; VI-NEXT: v_mov_b32_e32 v48, s4 ; VI-NEXT: 
v_readlane_b32 s4, v62, 37 -; VI-NEXT: v_mov_b32_e32 v49, s4 +; VI-NEXT: v_mov_b32_e32 v60, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 38 -; VI-NEXT: v_mov_b32_e32 v44, s4 +; VI-NEXT: v_mov_b32_e32 v46, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 39 -; VI-NEXT: v_mov_b32_e32 v42, s4 +; VI-NEXT: v_mov_b32_e32 v50, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 40 -; VI-NEXT: v_mov_b32_e32 v56, s4 +; VI-NEXT: v_mov_b32_e32 v40, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 41 -; VI-NEXT: v_mov_b32_e32 v58, s4 +; VI-NEXT: v_mov_b32_e32 v52, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 42 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 43 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 44 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 45 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 46 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 47 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v3, s4 -; VI-NEXT: buffer_store_dword v3, off, 
s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, s78 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_readlane_b32 s4, v62, 48 ; VI-NEXT: v_mov_b32_e32 v31, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 49 -; VI-NEXT: v_mov_b32_e32 v30, s4 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s78 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_readlane_b32 s4, v62, 50 ; VI-NEXT: v_mov_b32_e32 v33, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 51 @@ -189327,20 +189451,20 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_readlane_b32 s4, v62, 53 ; VI-NEXT: v_mov_b32_e32 v28, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 54 +; VI-NEXT: v_readlane_b32 s6, v62, 13 ; VI-NEXT: v_mov_b32_e32 v34, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 55 +; VI-NEXT: v_mov_b32_e32 v59, s6 +; VI-NEXT: v_readlane_b32 s6, v62, 14 ; VI-NEXT: v_mov_b32_e32 v9, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 56 ; VI-NEXT: v_mov_b32_e32 v3, s88 +; VI-NEXT: v_mov_b32_e32 v53, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 15 ; VI-NEXT: v_mov_b32_e32 v21, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 57 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v50, s70 -; VI-NEXT: v_mov_b32_e32 v43, s54 -; VI-NEXT: v_mov_b32_e32 v37, s86 -; VI-NEXT: v_mov_b32_e32 v52, s84 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], 
s32 offset:324 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v51, s6 ; VI-NEXT: v_mov_b32_e32 v54, s5 ; VI-NEXT: v_mov_b32_e32 v23, s83 @@ -189362,8 +189486,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_mov_b32_e32 v11, s38 ; VI-NEXT: v_mov_b32_e32 v14, s48 ; VI-NEXT: .LBB95_5: ; %end -; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v58 +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v52, 8, v52 ; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -189404,31 +189528,31 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_readlane_b32 s31, v63, 1 ; VI-NEXT: v_readlane_b32 s30, v63, 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v58, v53, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v52, v30, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v20, v53, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v58, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v30, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v52, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v46 -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 
offset:164 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v58 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v20, v46, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v52, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v20, vcc, 4, v0 ; VI-NEXT: buffer_store_dword v2, v20, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v56 +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v40 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v2, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v19, vcc, 8, v0 ; VI-NEXT: buffer_store_dword v2, v19, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v41 +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v56 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v22 @@ -189436,36 +189560,36 @@ define inreg <128 x i8> 
@bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v2, v2, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v19, vcc, 12, v0 ; VI-NEXT: buffer_store_dword v2, v19, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v42 +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v50 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v18, vcc, 16, v0 ; VI-NEXT: buffer_store_dword v2, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v60 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v44 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v44 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v46 ; 
VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v17 -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v40 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v49 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v15 @@ -189473,9 +189597,9 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v49 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v60 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, 
v16 @@ -189484,7 +189608,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v39 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -189493,8 +189617,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v48 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -189504,10 +189628,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: 
v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v37 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 @@ -189515,11 +189637,9 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v43 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v10 @@ -189528,10 +189648,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: 
v_lshlrev_b32_e32 v1, 8, v42 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v27 @@ -189539,18 +189657,14 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v41 ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v7 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v25 ; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -189559,8 +189673,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 60, v0 ; VI-NEXT: buffer_store_dword 
v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -189570,17 +189684,19 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 64, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v35 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 -; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], 
s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v34 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -189590,8 +189706,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -189601,8 +189717,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v29 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -189612,8 +189728,8 @@ define inreg <128 x i8> 
@bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v28 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -189623,86 +189739,90 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v33 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v5 -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, 
s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v32 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v31 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: 
v_lshlrev_b32_e32 v1, 8, v30 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v43, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: 
v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -189710,28 +189830,29 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 
v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -189739,15 +189860,15 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, 
off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -189768,8 +189889,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -190185,7 +190306,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v36 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-NEXT: 
v_lshrrev_b32_e32 v19, 8, v13 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v36 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v22 @@ -190193,7 +190314,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v35 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v22 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v38 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v22 @@ -190492,7 +190613,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: v_readlane_b32 s4, v62, 22 ; GFX9-NEXT: v_mov_b32_e32 v60, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 23 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v17, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 24 ; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill @@ -190500,7 +190621,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: v_readlane_b32 s4, v62, 25 ; GFX9-NEXT: v_mov_b32_e32 v23, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 26 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 
v17, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 27 ; GFX9-NEXT: v_mov_b32_e32 v59, s4 @@ -190770,14 +190891,14 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v35, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -192149,19 +192270,19 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v19, 
off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:392 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:4 @@ -192187,431 +192308,439 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v14 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v16 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v12 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, 
off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v22 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v20 -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v30 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v28 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v8 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v28 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v16 ; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v24 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr55 
+; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:360 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:332 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:360 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v7 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v9 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v5 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v13 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded 
Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v15 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v19 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v21 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v27 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v17 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v29 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v23 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v27 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v32 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 
offset:672 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v31 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v32 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:300 +; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v18 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 
offset:128 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v29 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v31 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; SI-NEXT: s_waitcnt vmcnt(2) -; 
SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:180 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v33 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v33 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v34 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 +; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v10 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v34 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:172 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:196 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:220 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 -; SI-NEXT: 
s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:192 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:216 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:212 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:208 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 +; SI-NEXT: 
buffer_store_dword v5, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:228 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:252 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:224 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:204 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:248 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:240 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v9 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v9 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], 
s32 offset:832 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:284 ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:256 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v11 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; SI-NEXT: 
v_lshlrev_b32_e32 v7, 24, v11 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:292 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:316 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:312 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:308 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:304 -; SI-NEXT: buffer_store_dword v4, 
off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v9 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v9 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v11 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:300 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:324 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:348 ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:320 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v7 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:344 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:340 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:336 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 
offset:336 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v61, 24, v9 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:332 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:356 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:380 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:352 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:352 ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:376 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:372 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:368 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:364 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:364 +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v16 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:388 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:384 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v10 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v18 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_lshlrev_b32_e32 v31, 24, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:388 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:384 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 
offset:568 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v11 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v8 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v31, 8, v15 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v8 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v11 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v17 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v7 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:808 ; 4-byte 
Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 
offset:64 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:56 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v12 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, 
s[0:3], s32 offset:976 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 -; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v13 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:48 +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v15 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:792 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, 
off, s[0:3], s32 offset:296 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 @@ -192652,19 +192781,26 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v12, v1, v2 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_or_b32_e32 v7, v1, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded 
Reload -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v42, v2, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) @@ -192673,7 +192809,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v5, v2, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -192681,488 +192817,444 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v7, v16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v26, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v11, v2, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v40, v1, v2 +; SI-NEXT: v_or_b32_e32 v42, v1, v2 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v10, v24, v1 +; SI-NEXT: v_or_b32_e32 v39, v24, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v15, v2, v1 +; SI-NEXT: v_or_b32_e32 v17, v2, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v26, v1, v2 +; SI-NEXT: v_or_b32_e32 v55, v1, v2 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt 
vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v18, v25, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v18, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v19, v2, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v20, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v30, v2, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v28, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword 
v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v21, v2, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v27, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v29, v1, v2 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v33, v2, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v30, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v27, v2, v1 +; SI-NEXT: v_or_b32_e32 v53, v2, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte 
Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v19, v1, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v38, v1, v2 +; SI-NEXT: v_or_b32_e32 v33, v2, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v52, v1, v2 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v43, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, 
s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v15, v1, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v1, v2 +; SI-NEXT: v_or_b32_e32 v38, v2, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v45, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v55, v1, v6 +; SI-NEXT: v_or_b32_e32 v47, v1, v2 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v17, v6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v14, v1, v6 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v44, v1, v6 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v45, v6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v2, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v57, v3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: 
v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v25, v1, v6 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v59, v1, v3 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v4, v3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v47, v1, v6 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v62, v3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v44, v1, v3 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v58, v6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: s_waitcnt 
vmcnt(0) -; SI-NEXT: v_or_b32_e32 v16, v1, v6 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v59, v1, v6 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v8, v1, v3 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v62, v6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v21, v1, v3 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v24, v1, v6 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v6, v3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; 
SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v13, v1, v6 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v3, v1, v3 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v6, v6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v16, v1, v12 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v23, v1, v8 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v24, v12, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt 
vmcnt(0) -; SI-NEXT: v_or_b32_e32 v36, v1, v8 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_mov_b32_e32 v1, v32 +; SI-NEXT: v_or_b32_e32 v32, v10, v12 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v8, v8, v35 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_mov_b32_e32 v10, v34 +; SI-NEXT: v_or_b32_e32 v34, v22, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v35, v37, v22 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v36, v12, v35 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v37, v51, v22 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v35, v37, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v51, v22, v63 -; SI-NEXT: v_and_b32_e32 v22, 0xff, v34 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v34, v56, v22 +; SI-NEXT: v_and_b32_e32 
v12, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v37, v51, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v56, v60, v22 -; SI-NEXT: v_and_b32_e32 v22, 0xff, v54 -; SI-NEXT: v_or_b32_e32 v54, v22, v4 -; SI-NEXT: v_and_b32_e32 v22, 0xff, v41 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v9, v9, v22 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v57 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v32, v4 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v51, v12, v60 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v14, v31, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v31, v61, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v54 +; SI-NEXT: v_or_b32_e32 v54, v12, v23 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v23, v25, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v13, v13, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v28, v22, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 
offset:636 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v32, v22, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v25, v22, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v39, v22, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v40, v22, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v41, v22, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v57, v22, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v46, v22, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded 
Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v60, v22, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v63, v22, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v61, v22, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v22, v1 -; SI-NEXT: v_and_b32_e32 v22, 0xff, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v22, 0xff, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v55 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v1, v1, v45 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v48, v22, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; 
SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v10, v22, v10 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v46, v22, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v49, v22, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v48, v22, v48 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v50, v22, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v56, v22, v49 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v49, v22, v49 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload +; 
SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v50, v22, v50 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 -; SI-NEXT: v_or_b32_e32 v53, v22, v53 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v58, v22, v58 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 -; SI-NEXT: v_or_b32_e32 v3, v22, v3 -; SI-NEXT: v_and_b32_e32 v22, 0xff, v61 -; SI-NEXT: v_mov_b32_e32 v61, v42 -; SI-NEXT: v_or_b32_e32 v31, v22, v31 -; SI-NEXT: v_or_b32_e32 v22, v12, v61 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v28 -; SI-NEXT: v_or_b32_e32 v43, v12, v5 +; SI-NEXT: v_or_b32_e32 v9, v22, v9 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v52 +; SI-NEXT: v_or_b32_e32 v52, v22, v63 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v22, v7, v63 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v7, v5 ; SI-NEXT: v_alignbit_b32 v5, v22, v5, 16 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v12, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v32 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v25 ; SI-NEXT: v_or_b32_e32 v7, v7, v11 -; SI-NEXT: v_and_b32_e32 v32, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v32, v32, v59 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v25, v39 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xffff, 
v5 -; SI-NEXT: v_or_b32_e32 v5, v5, v12 +; SI-NEXT: v_or_b32_e32 v5, v5, v26 ; SI-NEXT: v_alignbit_b32 v11, v5, v11, 16 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v42 +; SI-NEXT: v_or_b32_e32 v42, v11, v25 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v40 -; SI-NEXT: v_or_b32_e32 v42, v11, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v39 -; SI-NEXT: v_or_b32_e32 v40, v11, v15 -; SI-NEXT: v_alignbit_b32 v11, v42, v15, 16 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v26 -; SI-NEXT: v_or_b32_e32 v26, v11, v18 +; SI-NEXT: v_or_b32_e32 v40, v11, v17 +; SI-NEXT: v_alignbit_b32 v11, v42, v17, 16 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v55, v11, v18 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v41 -; SI-NEXT: v_or_b32_e32 v39, v11, v19 -; SI-NEXT: v_alignbit_b32 v11, v26, v19, 16 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v39, v11, v20 +; SI-NEXT: v_alignbit_b32 v11, v55, v20, 16 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: v_mov_b32_e32 v41, v28 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_or_b32_e32 v28, v11, v30 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v57 -; SI-NEXT: v_or_b32_e32 v11, v11, v21 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; SI-NEXT: 
s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v11, v28, v21, 16 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v28, v11, v41 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v46 +; SI-NEXT: v_or_b32_e32 v20, v11, v27 +; SI-NEXT: v_alignbit_b32 v11, v28, v27, 16 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v29 -; SI-NEXT: v_or_b32_e32 v29, v11, v33 +; SI-NEXT: v_or_b32_e32 v29, v11, v30 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v60 -; SI-NEXT: v_or_b32_e32 v21, v11, v27 -; SI-NEXT: v_alignbit_b32 v11, v29, v27, 16 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v27, v11, v53 +; SI-NEXT: v_alignbit_b32 v11, v29, v53, 16 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v19, v11, v33 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v61 +; SI-NEXT: v_or_b32_e32 v11, v11, v43 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v11, v19, v43, 16 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v11, v11, v38 +; SI-NEXT: v_alignbit_b32 v1, v11, v45, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_or_b32_e32 v19, v11, v38 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v63 -; SI-NEXT: v_or_b32_e32 v27, 
v11, v52 -; SI-NEXT: v_alignbit_b32 v11, v19, v52, 16 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v47 +; SI-NEXT: v_or_b32_e32 v15, v1, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v1, v1, v57 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_or_b32_e32 v11, v11, v2 -; SI-NEXT: v_alignbit_b32 v1, v11, v55, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v15, v57, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_or_b32_e32 v15, v1, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v59 +; SI-NEXT: v_or_b32_e32 v17, v1, v4 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v1, v1, v44 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v15, v44, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v1, v1, v62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v45 -; SI-NEXT: v_or_b32_e32 v17, v1, v25 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v46 -; SI-NEXT: v_or_b32_e32 v1, v1, v47 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v17, v62, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:896 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v17, v47, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v58 -; SI-NEXT: v_or_b32_e32 v1, v1, v16 -; SI-NEXT: v_alignbit_b32 v32, v1, v59, 16 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v32, 0xffff, v62 -; SI-NEXT: v_or_b32_e32 v59, v6, v23 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v20 -; SI-NEXT: v_or_b32_e32 v62, v32, v24 -; SI-NEXT: v_and_b32_e32 v32, 0xffff, v50 -; SI-NEXT: v_or_b32_e32 v50, v6, v36 -; SI-NEXT: v_alignbit_b32 v6, v59, v36, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v8 -; SI-NEXT: v_or_b32_e32 v47, v6, v35 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v53 -; SI-NEXT: v_or_b32_e32 v49, v6, v37 -; SI-NEXT: v_alignbit_b32 v6, v47, v37, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v51 -; SI-NEXT: v_or_b32_e32 v45, v6, v34 -; SI-NEXT: v_or_b32_e32 v48, v3, v56 -; SI-NEXT: v_alignbit_b32 v3, v45, v56, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v54 -; SI-NEXT: v_or_b32_e32 v44, v3, v4 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v31 -; SI-NEXT: v_or_b32_e32 v3, v3, v9 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 -; SI-NEXT: v_mov_b32_e32 v14, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v56 +; SI-NEXT: v_or_b32_e32 v1, v1, v8 +; 
SI-NEXT: v_or_b32_e32 v10, v10, v21 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v1, v21, 16 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v62, v6, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v23 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -193346,133 +193438,175 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: v_or_b32_e32 v6, v6, v16 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v62, v16, 16 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v24 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: v_or_b32_e32 v59, v6, v32 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v50 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: v_or_b32_e32 v56, v6, v34 +; SI-NEXT: v_alignbit_b32 v6, v59, v34, 16 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v36 ; SI-NEXT: 
; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: v_or_b32_e32 v47, v6, v35 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v58 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: v_or_b32_e32 v50, v6, v37 +; SI-NEXT: v_alignbit_b32 v6, v47, v37, 16 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v51 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: v_or_b32_e32 v45, v6, v14 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v9 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: v_or_b32_e32 v49, v6, v31 +; SI-NEXT: v_alignbit_b32 v6, v45, v31, 16 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v54 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: v_or_b32_e32 v44, v6, v23 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v52 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: v_or_b32_e32 v46, v32, v13 -; SI-NEXT: v_alignbit_b32 v13, v62, v13, 16 -; SI-NEXT: v_alignbit_b32 v6, v44, v9, 16 +; SI-NEXT: v_or_b32_e32 v48, v6, v13 +; SI-NEXT: v_alignbit_b32 v6, v44, v13, 16 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v61 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v25 -; SI-NEXT: 
v_lshrrev_b32_e32 v2, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v63 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v35 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v34 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr63 ; 
SI-NEXT: .LBB96_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB96_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v41 -; SI-NEXT: v_or_b32_e32 v1, v31, v1 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 -; SI-NEXT: v_or_b32_e32 v2, v9, v2 +; SI-NEXT: v_or_b32_e32 v2, v13, v2 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v54 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v57 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v46 +; SI-NEXT: v_or_b32_e32 v2, v23, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; SI-NEXT: v_or_b32_e32 v4, v32, v4 +; SI-NEXT: v_or_b32_e32 v3, v25, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v14 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v56, v5 -; SI-NEXT: v_mov_b32_e32 v30, v16 +; SI-NEXT: v_or_b32_e32 v5, v31, v5 ; SI-NEXT: s_mov_b32 s7, 0x3000000 -; SI-NEXT: v_mov_b32_e32 v31, v24 +; SI-NEXT: v_mov_b32_e32 v30, v24 ; SI-NEXT: v_add_i32_e32 v44, vcc, s7, v2 -; SI-NEXT: v_add_i32_e32 v27, vcc, s7, v1 -; SI-NEXT: buffer_load_dword v6, 
off, s[0:3], s32 offset:916 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v31, vcc, s7, v1 +; SI-NEXT: v_mov_b32_e32 v48, v31 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:944 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_or_b32_e32 v3, v9, v3 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_or_b32_e32 v4, v61, v4 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: 
v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 @@ -193481,41 +193615,27 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v37, v7 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v60, v4 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v32, vcc, s7, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v48, v32 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v4, v63, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v45, vcc, s7, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v45 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_or_b32_e32 v5, v53, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v33, vcc, s7, v5 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v33, vcc, s7, v4 +; SI-NEXT: buffer_load_dword v4, off, 
s[0:3], s32 offset:548 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v49, v33 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v3, v60, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v5, v5, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v45, vcc, s7, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v3, v58, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 @@ -193523,13 +193643,16 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v35, vcc, s7, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v47, vcc, s7, v6 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v50, v35 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -193538,16 
+193661,16 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v7 -; SI-NEXT: v_mov_b32_e32 v50, v6 +; SI-NEXT: v_mov_b32_e32 v56, v6 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -193556,15 +193679,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v59, vcc, s7, v8 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 
offset:776 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -193573,16 +193696,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v9 -; SI-NEXT: v_mov_b32_e32 v46, v8 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -193591,15 +193713,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v62, vcc, s7, v10 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: 
v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -193608,15 +193730,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v11 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -193625,15 +193747,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v12 ; SI-NEXT: 
s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -193642,45 +193764,48 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v13 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 ; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:756 
; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v15 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -193689,9 +193814,10 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v18, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 @@ -193706,30 +193832,30 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_or_b32_e32 v18, v18, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v16, vcc, s7, v18 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v23 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 ; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: v_or_b32_e32 v19, v19, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:592 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v19 ; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v17, v20, v17 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 @@ -193741,13 +193867,14 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v20, v21, v20 ; SI-NEXT: v_or_b32_e32 v20, v20, v17 ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, s7, v20 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v17, v21, v17 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -193756,15 +193883,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_or_b32_e32 v21, v21, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v17, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v19, vcc, s7, v21 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v17, v22, v17 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -193773,32 +193900,31 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_or_b32_e32 v22, v22, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v22 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, s7, v22 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v17, v23, v17 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 ; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v26, v23 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 ; SI-NEXT: 
v_or_b32_e32 v23, v23, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v26, v25 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v29, vcc, s7, v23 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: v_or_b32_e32 v24, v24, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v24 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -193807,16 +193933,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v25, v25, v17 ; SI-NEXT: v_or_b32_e32 v2, v25, v2 -; SI-NEXT: v_add_i32_e32 v18, vcc, s7, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v20, vcc, s7, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v17, vcc, s7, v14 -; SI-NEXT: v_mov_b32_e32 v14, v27 -; SI-NEXT: v_add_i32_e32 v27, vcc, s7, v20 +; SI-NEXT: v_add_i32_e32 v14, vcc, s7, v18 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -193826,10 +193951,10 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: 
v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_add_i32_e32 v28, vcc, s7, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v28 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v28 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 @@ -193846,8 +193971,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_add_i32_e32 v39, vcc, s7, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 @@ -193860,16 +193985,17 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v26, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_add_i32_e32 v26, vcc, s7, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v55, vcc, s7, v2 +; SI-NEXT: buffer_load_dword 
v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v26 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v55 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -193882,8 +194008,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_add_i32_e32 v40, vcc, s7, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 @@ -193896,30 +194022,29 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v31, v3 +; SI-NEXT: v_or_b32_e32 v3, v30, v3 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_add_i32_e32 v42, vcc, s7, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v42 +; SI-NEXT: buffer_load_dword v2, off, 
s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v59 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_or_b32_e32 v2, v34, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 @@ -193932,28 +194057,27 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v30, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:544 ; 4-byte Folded 
Reload -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v59 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_or_b32_e32 v2, v32, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_add_i32_e32 v43, vcc, s7, v2 +; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload @@ -193962,7 +194086,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -193972,236 
+194096,239 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_add_i32_e32 v22, vcc, s7, v2 -; SI-NEXT: v_alignbit_b32 v2, v22, v43, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v22, v12, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v2, v5, v7, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v2, v42, v40, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v26, v39, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v2, v28, v18, 16 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v29, v21, 16 +; SI-NEXT: v_alignbit_b32 v2, v55, v39, 16 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v19, v27, 16 +; SI-NEXT: v_alignbit_b32 v2, v28, v20, 16 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v2, v11, v16, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v2, v29, v27, 16 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill -; 
SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v2, v15, v13, 16 +; SI-NEXT: v_alignbit_b32 v2, v19, v16, 16 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v2, v17, v10, 16 +; SI-NEXT: v_alignbit_b32 v2, v11, v14, 16 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v2, v1, v9, 16 +; SI-NEXT: v_alignbit_b32 v2, v15, v13, 16 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v62, v8, 16 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v2, v17, v10, 16 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v59, v6, 16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v2, v1, v9, 16 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v47, v33, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v2, v62, v8, 16 ; 
SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v45, v32, 16 +; SI-NEXT: v_alignbit_b32 v2, v59, v6, 16 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v44, v14, 16 +; SI-NEXT: v_alignbit_b32 v2, v47, v35, 16 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v2, v45, v33, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v2, v44, v31, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v17 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v62 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v45 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v44 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill ; SI-NEXT: .LBB96_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v43 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v12 ; 
SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v3, v3, v9 ; SI-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v58 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v57 +; SI-NEXT: v_or_b32_e32 v3, v3, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v12 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v43 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:868 
; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v40 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v42 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v55 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v36 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v39 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v20 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v53 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: 
buffer_load_dword v5, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v52 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v27 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v29 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v13 -; SI-NEXT: v_or_b32_e32 v3, v3, 
v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v16 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v27 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v10 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: 
v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v38 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v26 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; 
SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x50, v0 +; SI-NEXT: 
buffer_store_dword v3, v5, s[0:3], 0 offen ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v46 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 @@ -194213,9 +194340,9 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v56 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 @@ -194227,9 +194354,9 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v50 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 @@ -194237,13 +194364,13 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: 
s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v47 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v48 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v49 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 @@ -194251,19 +194378,19 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v45 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v48 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 @@ -198476,8 +198603,8 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI: ; %bb.0: ; 
SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill @@ -198495,21 +198622,21 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:332 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:328 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:324 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:320 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:308 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:304 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:300 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:296 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:292 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:288 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:332 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:328 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:324 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:320 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:308 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 
offset:304 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:300 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:296 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:292 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:288 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:276 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:272 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:268 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:268 ; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:264 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:260 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:260 ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:256 ; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:244 @@ -198519,22 +198646,13 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:236 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:232 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:228 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:224 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:224 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:212 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:208 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:204 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:204 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:200 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:196 -; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:196 ; SI-NEXT: v_writelane_b32 v63, s30, 0 -; SI-NEXT: v_writelane_b32 v62, s28, 0 -; SI-NEXT: v_writelane_b32 v62, s25, 1 -; SI-NEXT: v_writelane_b32 v62, s24, 2 -; SI-NEXT: v_writelane_b32 v62, s23, 3 -; SI-NEXT: 
v_writelane_b32 v62, s22, 4 -; SI-NEXT: v_writelane_b32 v62, s21, 5 -; SI-NEXT: v_writelane_b32 v62, s18, 6 -; SI-NEXT: v_writelane_b32 v62, s16, 7 ; SI-NEXT: v_writelane_b32 v63, s31, 1 ; SI-NEXT: v_writelane_b32 v63, s34, 2 ; SI-NEXT: v_writelane_b32 v63, s35, 3 @@ -198547,17 +198665,29 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_writelane_b32 v63, s50, 10 ; SI-NEXT: v_writelane_b32 v63, s51, 11 ; SI-NEXT: v_writelane_b32 v63, s52, 12 +; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane ; SI-NEXT: v_writelane_b32 v63, s53, 13 +; SI-NEXT: v_writelane_b32 v62, s28, 0 ; SI-NEXT: v_writelane_b32 v63, s54, 14 +; SI-NEXT: v_writelane_b32 v62, s27, 1 ; SI-NEXT: v_writelane_b32 v63, s55, 15 +; SI-NEXT: v_writelane_b32 v62, s26, 2 ; SI-NEXT: v_writelane_b32 v63, s64, 16 +; SI-NEXT: v_writelane_b32 v62, s25, 3 ; SI-NEXT: v_writelane_b32 v63, s65, 17 +; SI-NEXT: v_writelane_b32 v62, s24, 4 ; SI-NEXT: v_writelane_b32 v63, s66, 18 +; SI-NEXT: v_writelane_b32 v62, s23, 5 ; SI-NEXT: v_writelane_b32 v63, s67, 19 +; SI-NEXT: v_writelane_b32 v62, s22, 6 ; SI-NEXT: v_writelane_b32 v63, s68, 20 +; SI-NEXT: v_writelane_b32 v62, s21, 7 ; SI-NEXT: v_writelane_b32 v63, s69, 21 +; SI-NEXT: v_writelane_b32 v62, s20, 8 ; SI-NEXT: v_writelane_b32 v63, s70, 22 +; SI-NEXT: v_writelane_b32 v62, s18, 9 ; SI-NEXT: v_writelane_b32 v63, s71, 23 +; SI-NEXT: v_writelane_b32 v62, s16, 10 ; SI-NEXT: v_writelane_b32 v63, s80, 24 ; SI-NEXT: v_writelane_b32 v63, s81, 25 ; SI-NEXT: v_writelane_b32 v63, s82, 26 @@ -198569,254 +198699,252 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_writelane_b32 v63, s96, 32 ; SI-NEXT: v_writelane_b32 v63, s97, 33 ; SI-NEXT: v_writelane_b32 v63, s98, 34 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v21, v5 ; SI-NEXT: v_writelane_b32 v63, s99, 35 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, v26 ; SI-NEXT: v_readfirstlane_b32 s15, 
v16 -; SI-NEXT: v_readfirstlane_b32 s18, v25 +; SI-NEXT: v_readfirstlane_b32 s21, v25 ; SI-NEXT: v_readfirstlane_b32 s43, v15 ; SI-NEXT: v_readfirstlane_b32 s42, v24 ; SI-NEXT: v_readfirstlane_b32 s44, v23 -; SI-NEXT: v_readfirstlane_b32 s49, v12 -; SI-NEXT: v_readfirstlane_b32 s8, v11 -; SI-NEXT: v_readfirstlane_b32 s53, v20 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_readfirstlane_b32 s4, v34 -; SI-NEXT: v_writelane_b32 v62, s4, 8 -; SI-NEXT: v_readfirstlane_b32 s4, v38 -; SI-NEXT: v_writelane_b32 v62, s4, 9 -; SI-NEXT: v_readfirstlane_b32 s4, v49 -; SI-NEXT: v_writelane_b32 v62, s4, 10 -; SI-NEXT: v_readfirstlane_b32 s4, v50 +; SI-NEXT: v_readfirstlane_b32 s4, v33 ; SI-NEXT: v_writelane_b32 v62, s4, 11 -; SI-NEXT: v_readfirstlane_b32 s79, v52 -; SI-NEXT: v_readfirstlane_b32 s88, v54 -; SI-NEXT: v_readfirstlane_b32 s4, v55 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:192 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:180 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:176 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:172 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 +; SI-NEXT: v_readfirstlane_b32 s4, v34 +; SI-NEXT: v_writelane_b32 v62, s4, 12 +; SI-NEXT: v_readfirstlane_b32 s52, v37 +; SI-NEXT: v_readfirstlane_b32 s82, v48 +; SI-NEXT: v_readfirstlane_b32 s4, v53 +; SI-NEXT: v_readfirstlane_b32 s79, v50 +; SI-NEXT: v_readfirstlane_b32 s88, v52 +; SI-NEXT: v_writelane_b32 v62, s4, 13 +; SI-NEXT: v_readfirstlane_b32 s77, v55 +; SI-NEXT: v_readfirstlane_b32 s4, v41 +; SI-NEXT: v_readfirstlane_b32 s35, v42 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:192 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:176 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:172 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:164 ; SI-NEXT: 
buffer_load_dword v61, off, s[0:3], s32 offset:160 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:148 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:144 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:140 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:136 -; SI-NEXT: v_writelane_b32 v62, s4, 12 -; SI-NEXT: v_readfirstlane_b32 s77, v41 -; SI-NEXT: v_readfirstlane_b32 s4, v42 -; SI-NEXT: v_readfirstlane_b32 s94, v31 -; SI-NEXT: v_readfirstlane_b32 s70, v32 -; SI-NEXT: v_readfirstlane_b32 s51, v33 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:148 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:144 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:140 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:136 +; SI-NEXT: v_readfirstlane_b32 s16, v31 +; SI-NEXT: v_readfirstlane_b32 s26, v32 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_readfirstlane_b32 s37, v45 -; SI-NEXT: v_readfirstlane_b32 s24, v56 +; SI-NEXT: v_readfirstlane_b32 s76, v45 +; SI-NEXT: v_readfirstlane_b32 s66, v56 ; SI-NEXT: v_readfirstlane_b32 s7, v57 ; SI-NEXT: v_readfirstlane_b32 s92, v58 -; SI-NEXT: v_readfirstlane_b32 s28, v59 +; SI-NEXT: v_readfirstlane_b32 s27, v59 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:100 ; 
SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 ; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 -; SI-NEXT: v_readfirstlane_b32 s35, v43 -; SI-NEXT: v_readfirstlane_b32 s55, v46 -; SI-NEXT: v_readfirstlane_b32 s68, v35 -; SI-NEXT: v_readfirstlane_b32 s87, v37 +; SI-NEXT: v_readfirstlane_b32 s51, v44 +; SI-NEXT: v_readfirstlane_b32 s55, v47 +; SI-NEXT: v_readfirstlane_b32 s6, v35 +; SI-NEXT: v_readfirstlane_b32 s98, v36 +; SI-NEXT: v_readfirstlane_b32 s18, v38 ; SI-NEXT: v_readfirstlane_b32 s67, v39 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_readfirstlane_b32 s74, v53 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:64 -; SI-NEXT: v_readfirstlane_b32 s85, v48 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 -; SI-NEXT: v_writelane_b32 v62, s4, 13 -; SI-NEXT: v_readfirstlane_b32 s98, v40 +; SI-NEXT: v_readfirstlane_b32 s34, v54 ; SI-NEXT: v_readfirstlane_b32 s69, v51 -; SI-NEXT: v_readfirstlane_b32 s21, v36 -; SI-NEXT: v_readfirstlane_b32 s40, v19 -; SI-NEXT: v_readfirstlane_b32 s23, v28 -; SI-NEXT: v_readfirstlane_b32 s34, v27 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v21, v13 -; SI-NEXT: v_mov_b32_e32 v13, v5 -; SI-NEXT: v_readfirstlane_b32 s97, v29 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 +; SI-NEXT: 
buffer_load_dword v47, off, s[0:3], s32 offset:40 +; SI-NEXT: v_readfirstlane_b32 s87, v40 +; SI-NEXT: v_readfirstlane_b32 s86, v49 +; SI-NEXT: v_writelane_b32 v62, s4, 14 +; SI-NEXT: v_writelane_b32 v62, s17, 15 +; SI-NEXT: v_writelane_b32 v62, s15, 16 +; SI-NEXT: v_writelane_b32 v62, s21, 17 +; SI-NEXT: v_writelane_b32 v62, s43, 18 +; SI-NEXT: v_writelane_b32 v62, s42, 19 +; SI-NEXT: v_writelane_b32 v62, s44, 20 +; SI-NEXT: v_readfirstlane_b32 s53, v12 +; SI-NEXT: v_readfirstlane_b32 s23, v11 +; SI-NEXT: v_readfirstlane_b32 s8, v20 +; SI-NEXT: v_readfirstlane_b32 s48, v19 +; SI-NEXT: v_readfirstlane_b32 s63, v28 +; SI-NEXT: v_readfirstlane_b32 s95, v27 +; SI-NEXT: v_mov_b32_e32 v29, v13 +; SI-NEXT: v_readfirstlane_b32 s97, v26 ; SI-NEXT: v_readfirstlane_b32 s80, v18 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14 ; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v22 -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v30 ; SI-NEXT: v_readfirstlane_b32 s96, v17 +; SI-NEXT: v_readfirstlane_b32 s65, v10 ; SI-NEXT: v_readfirstlane_b32 s64, v9 -; SI-NEXT: v_readfirstlane_b32 s25, v8 +; SI-NEXT: v_readfirstlane_b32 s68, v8 ; SI-NEXT: v_readfirstlane_b32 s83, v7 ; SI-NEXT: v_readfirstlane_b32 s84, v4 ; SI-NEXT: v_readfirstlane_b32 s93, v3 -; SI-NEXT: v_readfirstlane_b32 s76, v1 -; SI-NEXT: v_readfirstlane_b32 s58, v38 -; SI-NEXT: v_readfirstlane_b32 s65, v49 -; SI-NEXT: v_readfirstlane_b32 s62, v54 -; SI-NEXT: v_readfirstlane_b32 s81, v44 -; SI-NEXT: v_readfirstlane_b32 s71, v47 -; SI-NEXT: v_readfirstlane_b32 s38, v60 -; SI-NEXT: v_readfirstlane_b32 s86, v61 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:156 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:220 +; SI-NEXT: v_readfirstlane_b32 s90, v2 +; SI-NEXT: v_readfirstlane_b32 s11, v1 +; SI-NEXT: v_readfirstlane_b32 s59, v37 +; SI-NEXT: v_readfirstlane_b32 s94, v50 +; SI-NEXT: v_readfirstlane_b32 s39, v53 +; SI-NEXT: v_readfirstlane_b32 s81, v43 +; SI-NEXT: v_readfirstlane_b32 s71, 
v46 +; SI-NEXT: v_readfirstlane_b32 s85, v60 +; SI-NEXT: v_readfirstlane_b32 s89, v61 +; SI-NEXT: v_readfirstlane_b32 s49, v33 +; SI-NEXT: v_readfirstlane_b32 s70, v34 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_readfirstlane_b32 s90, v50 -; SI-NEXT: v_readfirstlane_b32 s31, v52 -; SI-NEXT: v_readfirstlane_b32 s4, v55 +; SI-NEXT: v_readfirstlane_b32 s74, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:220 +; SI-NEXT: v_readfirstlane_b32 s91, v52 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:336 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 -; SI-NEXT: v_readfirstlane_b32 s72, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:316 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:72 -; SI-NEXT: v_readfirstlane_b32 s82, v56 -; SI-NEXT: v_readfirstlane_b32 s95, v57 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:336 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 +; SI-NEXT: v_readfirstlane_b32 s37, v56 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_readfirstlane_b32 s39, v58 -; SI-NEXT: v_readfirstlane_b32 s56, v59 -; SI-NEXT: v_readfirstlane_b32 s57, v41 +; SI-NEXT: v_readfirstlane_b32 s38, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 +; SI-NEXT: 
buffer_load_dword v36, off, s[0:3], s32 offset:72 +; SI-NEXT: v_readfirstlane_b32 s25, v57 +; SI-NEXT: v_readfirstlane_b32 s56, v58 +; SI-NEXT: v_readfirstlane_b32 s57, v55 +; SI-NEXT: v_readfirstlane_b32 s58, v41 ; SI-NEXT: v_readfirstlane_b32 s36, v42 -; SI-NEXT: v_readfirstlane_b32 s73, v45 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:284 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:252 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:188 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:124 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 -; SI-NEXT: v_readfirstlane_b32 s16, v34 -; SI-NEXT: v_readfirstlane_b32 s48, v32 -; SI-NEXT: v_readfirstlane_b32 s52, v33 -; SI-NEXT: v_writelane_b32 v62, s4, 14 +; SI-NEXT: v_readfirstlane_b32 s40, v45 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:284 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:252 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 +; SI-NEXT: v_readfirstlane_b32 s75, v32 ; SI-NEXT: v_readfirstlane_b32 s47, v35 -; SI-NEXT: v_readfirstlane_b32 s60, v37 -; SI-NEXT: v_readfirstlane_b32 s61, v39 -; SI-NEXT: v_readfirstlane_b32 s89, v43 +; SI-NEXT: v_writelane_b32 v62, s56, 21 +; SI-NEXT: v_writelane_b32 v62, s49, 22 +; SI-NEXT: v_readfirstlane_b32 s72, v38 +; SI-NEXT: v_readfirstlane_b32 s73, v39 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_readfirstlane_b32 s99, v46 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:312 +; SI-NEXT: v_readfirstlane_b32 s22, v44 +; SI-NEXT: v_readfirstlane_b32 s99, v47 +; SI-NEXT: v_writelane_b32 v62, s53, 
23 +; SI-NEXT: v_writelane_b32 v62, s70, 24 +; SI-NEXT: v_writelane_b32 v62, s23, 25 +; SI-NEXT: v_writelane_b32 v62, s57, 26 +; SI-NEXT: v_readfirstlane_b32 s54, v51 +; SI-NEXT: v_readfirstlane_b32 s50, v54 +; SI-NEXT: v_readfirstlane_b32 s31, v48 +; SI-NEXT: v_readfirstlane_b32 s78, v49 +; SI-NEXT: v_readfirstlane_b32 s30, v50 +; SI-NEXT: v_readfirstlane_b32 s24, v53 +; SI-NEXT: v_readfirstlane_b32 s28, v40 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_readfirstlane_b32 s20, v43 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v52 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s45, v46 +; SI-NEXT: v_writelane_b32 v62, s45, 27 +; SI-NEXT: v_writelane_b32 v62, s8, 28 +; SI-NEXT: v_writelane_b32 v62, s58, 29 +; SI-NEXT: v_writelane_b32 v62, s59, 30 +; SI-NEXT: v_writelane_b32 v62, s47, 31 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s60, v36 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:312 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:280 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:248 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:216 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:184 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:152 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 -; SI-NEXT: v_readfirstlane_b32 s54, v48 -; SI-NEXT: v_readfirstlane_b32 s50, v53 -; SI-NEXT: v_readfirstlane_b32 s78, v49 -; SI-NEXT: v_readfirstlane_b32 s30, v51 -; SI-NEXT: v_readfirstlane_b32 s66, v54 -; SI-NEXT: v_readfirstlane_b32 s91, v40 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_readfirstlane_b32 s6, v44 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v50 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s4, v10 -; SI-NEXT: v_writelane_b32 
v62, s4, 15 -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: v_writelane_b32 v62, s4, 16 -; SI-NEXT: v_writelane_b32 v62, s17, 17 -; SI-NEXT: v_writelane_b32 v62, s15, 18 -; SI-NEXT: v_writelane_b32 v62, s18, 19 -; SI-NEXT: v_writelane_b32 v62, s43, 20 -; SI-NEXT: v_writelane_b32 v62, s42, 21 -; SI-NEXT: v_writelane_b32 v62, s44, 22 -; SI-NEXT: v_writelane_b32 v62, s16, 23 -; SI-NEXT: v_writelane_b32 v62, s49, 24 -; SI-NEXT: v_writelane_b32 v62, s8, 25 -; SI-NEXT: v_writelane_b32 v62, s6, 26 -; SI-NEXT: v_readfirstlane_b32 s45, v52 -; SI-NEXT: v_writelane_b32 v62, s56, 27 -; SI-NEXT: v_writelane_b32 v62, s45, 28 -; SI-NEXT: v_writelane_b32 v62, s53, 29 -; SI-NEXT: v_writelane_b32 v62, s94, 30 -; SI-NEXT: v_writelane_b32 v62, s57, 31 -; SI-NEXT: v_writelane_b32 v62, s58, 32 -; SI-NEXT: v_writelane_b32 v62, s47, 33 -; SI-NEXT: v_readfirstlane_b32 s46, v55 -; SI-NEXT: v_writelane_b32 v62, s40, 34 -; SI-NEXT: v_readfirstlane_b32 s59, v47 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:184 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:152 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 +; SI-NEXT: v_writelane_b32 v62, s48, 32 +; SI-NEXT: v_writelane_b32 v62, s26, 33 +; SI-NEXT: v_readfirstlane_b32 s46, v60 +; SI-NEXT: v_writelane_b32 v62, s60, 34 +; SI-NEXT: v_readfirstlane_b32 s61, v61 ; SI-NEXT: v_writelane_b32 v62, s46, 35 -; SI-NEXT: v_writelane_b32 v62, s59, 36 -; SI-NEXT: v_writelane_b32 v62, s60, 37 +; SI-NEXT: v_writelane_b32 v62, s61, 36 +; SI-NEXT: v_writelane_b32 v62, s72, 37 ; SI-NEXT: v_writelane_b32 v62, s36, 38 -; SI-NEXT: v_writelane_b32 v62, s65, 39 -; SI-NEXT: v_writelane_b32 v62, s61, 40 -; SI-NEXT: v_writelane_b32 v62, s73, 41 -; SI-NEXT: v_writelane_b32 v62, s62, 42 -; SI-NEXT: v_writelane_b32 v62, s72, 43 -; SI-NEXT: 
v_writelane_b32 v62, s23, 44 -; SI-NEXT: v_writelane_b32 v62, s48, 45 -; SI-NEXT: v_writelane_b32 v62, s34, 46 +; SI-NEXT: v_writelane_b32 v62, s94, 39 +; SI-NEXT: v_writelane_b32 v62, s73, 40 +; SI-NEXT: v_writelane_b32 v62, s40, 41 +; SI-NEXT: v_writelane_b32 v62, s39, 42 +; SI-NEXT: v_writelane_b32 v62, s74, 43 +; SI-NEXT: v_writelane_b32 v62, s63, 44 +; SI-NEXT: v_writelane_b32 v62, s75, 45 +; SI-NEXT: v_writelane_b32 v62, s95, 46 ; SI-NEXT: v_writelane_b32 v62, s78, 47 ; SI-NEXT: v_writelane_b32 v62, s30, 48 ; SI-NEXT: v_writelane_b32 v62, s54, 49 ; SI-NEXT: v_writelane_b32 v62, s50, 50 -; SI-NEXT: v_writelane_b32 v62, s52, 51 -; SI-NEXT: v_writelane_b32 v62, s82, 52 -; SI-NEXT: v_writelane_b32 v62, s66, 53 -; SI-NEXT: v_readfirstlane_b32 s22, v36 +; SI-NEXT: v_writelane_b32 v62, s25, 51 +; SI-NEXT: v_writelane_b32 v62, s24, 52 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v6 +; SI-NEXT: v_lshlrev_b32_e32 v61, 24, v14 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v30 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v57 +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v56 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v58 +; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v57 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v59 -; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v56 -; SI-NEXT: v_lshlrev_b32_e32 v44, 24, v60 -; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v45 -; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v61 -; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v42 -; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v41 -; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v31 -; SI-NEXT: v_writelane_b32 v62, s91, 54 +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v58 +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v45 +; SI-NEXT: v_lshlrev_b32_e32 v43, 24, v59 +; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v42 +; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v31 +; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v41 +; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v55 +; SI-NEXT: v_lshlrev_b32_e32 v58, 
24, v5 +; SI-NEXT: v_writelane_b32 v62, s28, 53 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB97_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v5, v13 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: v_readlane_b32 s5, v62, 5 -; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s4, v62, 8 +; SI-NEXT: v_readlane_b32 s5, v62, 7 +; SI-NEXT: v_mov_b32_e32 v13, v21 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_writelane_b32 v62, s4, 55 -; SI-NEXT: v_readlane_b32 s4, v62, 4 +; SI-NEXT: v_writelane_b32 v62, s4, 54 +; SI-NEXT: v_readlane_b32 s4, v62, 6 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: v_readlane_b32 s5, 
v62, 3 +; SI-NEXT: v_readlane_b32 s5, v62, 5 ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s5, 24 -; SI-NEXT: s_or_b32 s63, s5, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_writelane_b32 v62, s4, 55 +; SI-NEXT: v_readlane_b32 s4, v62, 9 ; SI-NEXT: s_and_b32 s5, s4, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s9, s19, 24 @@ -198826,27 +198954,29 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_lshl_b32 s10, s29, 8 ; SI-NEXT: s_or_b32 s4, s5, s10 ; SI-NEXT: v_writelane_b32 v62, s4, 56 -; SI-NEXT: s_and_b32 s5, s76, 0xff -; SI-NEXT: v_readlane_b32 s10, v62, 16 +; SI-NEXT: v_writelane_b32 v62, s37, 57 +; SI-NEXT: s_and_b32 s5, s11, 0xff +; SI-NEXT: s_mov_b32 s37, s11 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s11, s10, 24 +; SI-NEXT: s_lshl_b32 s11, s90, 24 +; SI-NEXT: v_readlane_b32 s4, v62, 2 ; SI-NEXT: s_or_b32 s5, s11, s5 -; SI-NEXT: s_and_b32 s11, s26, 0xff +; SI-NEXT: s_and_b32 s11, s4, 0xff +; SI-NEXT: v_readlane_b32 s4, v62, 1 ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s12, s27, 24 +; SI-NEXT: s_lshl_b32 s12, s4, 24 ; SI-NEXT: s_or_b32 s14, s12, s11 ; SI-NEXT: s_and_b32 s11, s83, 0xff -; SI-NEXT: s_lshl_b32 s12, s25, 8 -; SI-NEXT: s_or_b32 s10, s11, s12 -; SI-NEXT: v_writelane_b32 v62, s10, 57 +; SI-NEXT: s_lshl_b32 s12, s68, 8 +; SI-NEXT: s_or_b32 s4, s11, s12 ; SI-NEXT: s_and_b32 s11, s64, 0xff -; SI-NEXT: v_readlane_b32 s10, v62, 15 ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s13, s10, 24 +; SI-NEXT: s_lshl_b32 s13, s65, 24 ; SI-NEXT: s_or_b32 s41, s13, s11 ; SI-NEXT: s_and_b32 s11, s43, 0xff ; SI-NEXT: s_lshl_b32 s13, s15, 8 -; SI-NEXT: s_or_b32 s10, s11, s13 +; SI-NEXT: v_writelane_b32 v62, s4, 58 +; SI-NEXT: s_or_b32 s4, s11, s13 ; SI-NEXT: s_and_b32 s11, s96, 0xff ; SI-NEXT: s_lshl_b32 s11, s11, 16 ; SI-NEXT: s_lshl_b32 s15, s80, 24 @@ -198854,266 +198984,269 @@ define inreg <64 x i16> 
@bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_and_b32 s11, s44, 0xff ; SI-NEXT: s_lshl_b32 s15, s42, 8 ; SI-NEXT: s_or_b32 s13, s11, s15 -; SI-NEXT: s_and_b32 s11, s18, 0xff +; SI-NEXT: s_and_b32 s11, s21, 0xff ; SI-NEXT: s_lshl_b32 s11, s11, 16 ; SI-NEXT: s_lshl_b32 s15, s97, 24 ; SI-NEXT: s_or_b32 s44, s15, s11 -; SI-NEXT: s_and_b32 s11, s59, 0xff +; SI-NEXT: s_and_b32 s11, s61, 0xff ; SI-NEXT: s_lshl_b32 s15, s46, 8 ; SI-NEXT: s_or_b32 s12, s11, s15 ; SI-NEXT: s_and_b32 s11, s45, 0xff ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s15, s6, 24 +; SI-NEXT: s_lshl_b32 s15, s20, 24 ; SI-NEXT: s_or_b32 s45, s15, s11 ; SI-NEXT: s_and_b32 s11, s30, 0xff ; SI-NEXT: s_lshl_b32 s15, s78, 8 -; SI-NEXT: v_writelane_b32 v62, s10, 58 ; SI-NEXT: s_or_b32 s10, s11, s15 ; SI-NEXT: s_and_b32 s11, s99, 0xff ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s15, s89, 24 +; SI-NEXT: s_lshl_b32 s15, s22, 24 ; SI-NEXT: s_or_b32 s46, s15, s11 -; SI-NEXT: s_and_b32 s11, s61, 0xff -; SI-NEXT: s_lshl_b32 s15, s60, 8 -; SI-NEXT: s_or_b32 s6, s11, s15 -; SI-NEXT: s_and_b32 s11, s22, 0xff +; SI-NEXT: s_and_b32 s11, s73, 0xff +; SI-NEXT: s_lshl_b32 s15, s72, 8 +; SI-NEXT: v_writelane_b32 v62, s4, 59 +; SI-NEXT: s_or_b32 s4, s11, s15 +; SI-NEXT: s_and_b32 s11, s60, 0xff ; SI-NEXT: s_lshl_b32 s11, s11, 16 ; SI-NEXT: s_lshl_b32 s15, s47, 24 ; SI-NEXT: s_or_b32 s47, s15, s11 -; SI-NEXT: s_and_b32 s11, s57, 0xff -; SI-NEXT: s_lshl_b32 s15, s56, 8 -; SI-NEXT: v_writelane_b32 v62, s6, 59 -; SI-NEXT: s_or_b32 s6, s11, s15 -; SI-NEXT: s_and_b32 s11, s39, 0xff -; SI-NEXT: v_writelane_b32 v62, s6, 60 +; SI-NEXT: s_and_b32 s11, s58, 0xff +; SI-NEXT: s_lshl_b32 s15, s57, 8 +; SI-NEXT: s_mov_b32 s62, s16 +; SI-NEXT: s_or_b32 s16, s11, s15 +; SI-NEXT: s_and_b32 s11, s38, 0xff ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s15, s95, 24 +; SI-NEXT: s_lshl_b32 s15, s56, 24 ; SI-NEXT: s_or_b32 s56, s15, s11 -; SI-NEXT: s_and_b32 s11, s48, 0xff -; SI-NEXT: 
s_lshl_b32 s15, s72, 8 -; SI-NEXT: v_readlane_b32 s6, v62, 14 -; SI-NEXT: s_or_b32 s48, s11, s15 -; SI-NEXT: s_and_b32 s11, s6, 0xff +; SI-NEXT: s_and_b32 s11, s75, 0xff +; SI-NEXT: s_lshl_b32 s15, s74, 8 +; SI-NEXT: s_or_b32 s73, s11, s15 +; SI-NEXT: s_and_b32 s11, s91, 0xff ; SI-NEXT: s_lshl_b32 s11, s11, 16 ; SI-NEXT: s_lshl_b32 s15, s31, 24 ; SI-NEXT: s_or_b32 vcc_lo, s15, s11 -; SI-NEXT: s_and_b32 s11, s86, 0xff -; SI-NEXT: s_lshl_b32 s15, s38, 8 +; SI-NEXT: s_and_b32 s11, s89, 0xff +; SI-NEXT: s_lshl_b32 s15, s85, 8 ; SI-NEXT: s_or_b32 s72, s11, s15 ; SI-NEXT: s_and_b32 s11, s71, 0xff ; SI-NEXT: s_lshl_b32 s11, s11, 16 ; SI-NEXT: s_lshl_b32 s15, s81, 24 ; SI-NEXT: s_or_b32 vcc_hi, s15, s11 -; SI-NEXT: s_and_b32 s11, s58, 0xff -; SI-NEXT: s_lshl_b32 s15, s85, 8 +; SI-NEXT: s_and_b32 s11, s59, 0xff +; SI-NEXT: s_lshl_b32 s15, s86, 8 ; SI-NEXT: s_or_b32 s57, s11, s15 ; SI-NEXT: s_and_b32 s11, s69, 0xff ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s15, s74, 24 -; SI-NEXT: v_writelane_b32 v62, s74, 61 +; SI-NEXT: s_lshl_b32 s15, s34, 24 ; SI-NEXT: s_or_b32 s74, s15, s11 -; SI-NEXT: s_and_b32 s11, s87, 0xff -; SI-NEXT: s_lshl_b32 s15, s21, 8 +; SI-NEXT: s_and_b32 s11, s18, 0xff +; SI-NEXT: s_lshl_b32 s15, s98, 8 ; SI-NEXT: s_or_b32 s58, s11, s15 -; SI-NEXT: s_and_b32 s11, s68, 0xff +; SI-NEXT: s_and_b32 s11, s6, 0xff +; SI-NEXT: v_writelane_b32 v62, s4, 60 ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s15, s28, 24 +; SI-NEXT: s_lshl_b32 s15, s27, 24 +; SI-NEXT: v_writelane_b32 v62, s34, 61 ; SI-NEXT: s_or_b32 s75, s15, s11 -; SI-NEXT: s_and_b32 s11, s24, 0xff +; SI-NEXT: s_and_b32 s11, s66, 0xff ; SI-NEXT: s_lshl_b32 s15, s55, 8 -; SI-NEXT: v_writelane_b32 v62, s25, 62 +; SI-NEXT: v_writelane_b32 v62, s6, 62 ; SI-NEXT: s_or_b32 s59, s11, s15 -; SI-NEXT: s_and_b32 s11, s37, 0xff +; SI-NEXT: s_and_b32 s11, s76, 0xff ; SI-NEXT: s_lshl_b32 s11, s11, 16 ; SI-NEXT: s_lshl_b32 s15, s51, 24 -; SI-NEXT: v_readlane_b32 s4, v62, 13 -; SI-NEXT: 
s_mov_b32 s18, s21 -; SI-NEXT: s_mov_b32 s21, s97 -; SI-NEXT: s_mov_b32 s97, s37 -; SI-NEXT: s_mov_b32 s37, s76 +; SI-NEXT: v_writelane_b32 v62, s85, 63 +; SI-NEXT: s_mov_b32 s4, s97 +; SI-NEXT: s_mov_b32 s97, s76 ; SI-NEXT: s_or_b32 s76, s15, s11 +; SI-NEXT: v_readlane_b32 s15, v62, 14 ; SI-NEXT: s_and_b32 s11, s35, 0xff -; SI-NEXT: s_lshl_b32 s15, s4, 8 +; SI-NEXT: s_lshl_b32 s15, s15, 8 ; SI-NEXT: s_or_b32 s60, s11, s15 ; SI-NEXT: s_and_b32 s11, s77, 0xff -; SI-NEXT: v_readlane_b32 s4, v62, 12 +; SI-NEXT: v_readlane_b32 s15, v62, 13 ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s15, s4, 24 -; SI-NEXT: v_readlane_b32 s4, v62, 11 -; SI-NEXT: s_mov_b32 s6, s95 -; SI-NEXT: s_mov_b32 s95, s39 -; SI-NEXT: s_mov_b32 s39, s89 -; SI-NEXT: s_mov_b32 s89, s99 +; SI-NEXT: s_lshl_b32 s15, s15, 24 +; SI-NEXT: s_mov_b32 s21, s20 +; SI-NEXT: s_mov_b32 s20, s38 +; SI-NEXT: s_mov_b32 s38, s99 ; SI-NEXT: s_mov_b32 s99, s83 ; SI-NEXT: s_mov_b32 s83, s55 ; SI-NEXT: s_mov_b32 s55, s64 ; SI-NEXT: s_mov_b32 s64, s35 ; SI-NEXT: s_mov_b32 s35, s77 ; SI-NEXT: s_or_b32 s77, s15, s11 -; SI-NEXT: s_and_b32 s11, s4, 0xff -; SI-NEXT: v_readlane_b32 s4, v62, 10 -; SI-NEXT: s_lshl_b32 s15, s4, 8 -; SI-NEXT: v_readlane_b32 s4, v62, 9 +; SI-NEXT: s_and_b32 s11, s82, 0xff +; SI-NEXT: s_lshl_b32 s15, s52, 8 ; SI-NEXT: s_or_b32 s61, s11, s15 -; SI-NEXT: s_and_b32 s11, s4, 0xff -; SI-NEXT: v_readlane_b32 s4, v62, 8 +; SI-NEXT: v_readlane_b32 s11, v62, 12 +; SI-NEXT: s_and_b32 s11, s11, 0xff +; SI-NEXT: v_readlane_b32 s15, v62, 11 ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s15, s4, 24 +; SI-NEXT: s_lshl_b32 s15, s15, 24 ; SI-NEXT: s_or_b32 s78, s15, s11 -; SI-NEXT: v_readlane_b32 s11, v62, 7 +; SI-NEXT: v_readlane_b32 s11, v62, 10 ; SI-NEXT: s_and_b32 s11, s11, 0xff ; SI-NEXT: s_lshl_b32 s15, s17, 8 ; SI-NEXT: s_or_b32 s11, s11, s15 ; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_mov_b32 s30, s18 +; SI-NEXT: s_mov_b32 s18, s89 +; SI-NEXT: s_mov_b32 s89, s98 +; SI-NEXT: 
s_mov_b32 s98, s96 +; SI-NEXT: s_mov_b32 s96, s66 +; SI-NEXT: s_mov_b32 s66, s82 +; SI-NEXT: s_mov_b32 s82, s52 ; SI-NEXT: v_mov_b32_e32 v51, s9 -; SI-NEXT: s_or_b32 s17, s11, s9 -; SI-NEXT: v_readlane_b32 s9, v62, 2 -; SI-NEXT: v_readlane_b32 s11, v62, 1 +; SI-NEXT: s_or_b32 s52, s11, s9 +; SI-NEXT: v_readlane_b32 s9, v62, 4 +; SI-NEXT: v_readlane_b32 s11, v62, 3 ; SI-NEXT: s_and_b32 s9, s9, 0xff ; SI-NEXT: s_lshl_b32 s15, s11, 8 ; SI-NEXT: s_or_b32 s9, s9, s15 ; SI-NEXT: s_and_b32 s9, s9, 0xffff -; SI-NEXT: s_mov_b32 s4, s96 -; SI-NEXT: s_mov_b32 s96, s24 ; SI-NEXT: v_mov_b32_e32 v52, s14 -; SI-NEXT: s_or_b32 s24, s9, s14 +; SI-NEXT: s_or_b32 s17, s9, s14 ; SI-NEXT: s_and_b32 s14, s93, 0xff ; SI-NEXT: s_lshl_b32 s15, s84, 8 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v53, v6, v1 +; SI-NEXT: v_or_b32_e32 v53, v60, v1 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: v_or_b32_e32 v50, s14, v53 -; SI-NEXT: s_and_b32 s14, s8, 0xff -; SI-NEXT: s_lshl_b32 s15, s49, 8 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 +; SI-NEXT: s_and_b32 s14, s23, 0xff +; SI-NEXT: s_lshl_b32 s15, s53, 8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v54, v14, v1 +; SI-NEXT: v_or_b32_e32 v54, v61, v1 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: v_or_b32_e32 v17, s14, v54 -; SI-NEXT: s_and_b32 s14, s40, 0xff -; SI-NEXT: s_lshl_b32 s15, s53, 8 +; SI-NEXT: s_and_b32 s14, s48, 0xff +; SI-NEXT: s_lshl_b32 s15, s8, 8 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v19 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v55, v18, v1 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: v_or_b32_e32 v16, s14, v55 -; SI-NEXT: s_and_b32 s14, s34, 0xff -; SI-NEXT: s_lshl_b32 s15, s23, 8 +; SI-NEXT: s_and_b32 s14, s95, 0xff +; SI-NEXT: s_lshl_b32 s15, s63, 8 ; 
SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v40, v19, v1 +; SI-NEXT: v_or_b32_e32 v40, v6, v1 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: v_or_b32_e32 v15, s14, v40 -; SI-NEXT: s_and_b32 s14, s91, 0xff -; SI-NEXT: s_lshl_b32 s15, s66, 8 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v43 +; SI-NEXT: s_and_b32 s14, s28, 0xff +; SI-NEXT: s_lshl_b32 s15, s24, 8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v44 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v41, v22, v1 +; SI-NEXT: v_or_b32_e32 v41, v20, v1 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: v_or_b32_e32 v12, s14, v41 ; SI-NEXT: s_and_b32 s14, s50, 0xff ; SI-NEXT: s_lshl_b32 s15, s54, 8 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v47 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v42, v23, v1 +; SI-NEXT: v_or_b32_e32 v42, v22, v1 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: v_or_b32_e32 v11, s14, v42 -; SI-NEXT: s_and_b32 s14, s73, 0xff +; SI-NEXT: s_and_b32 s14, s40, 0xff ; SI-NEXT: s_lshl_b32 s15, s36, 8 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v46 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v34 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v28, v59, v1 +; SI-NEXT: v_or_b32_e32 v59, v23, v1 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: v_or_b32_e32 v10, s14, v28 -; SI-NEXT: s_and_b32 s14, s82, 0xff -; SI-NEXT: s_lshl_b32 s15, s52, 8 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v26 +; SI-NEXT: v_readlane_b32 s8, v62, 57 +; SI-NEXT: v_or_b32_e32 v10, s14, v59 +; SI-NEXT: s_and_b32 s14, s25, 0xff +; SI-NEXT: s_lshl_b32 s15, s8, 8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v60, v24, v1 +; SI-NEXT: 
v_mov_b32_e32 v25, v20 +; SI-NEXT: v_mov_b32_e32 v20, v60 +; SI-NEXT: v_or_b32_e32 v60, v14, v1 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: v_or_b32_e32 v9, s14, v60 -; SI-NEXT: s_and_b32 s14, s90, 0xff -; SI-NEXT: s_lshl_b32 s15, s16, 8 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 +; SI-NEXT: s_and_b32 s14, s70, 0xff +; SI-NEXT: s_lshl_b32 s15, s49, 8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v36 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v31, v44, v1 +; SI-NEXT: v_or_b32_e32 v5, v43, v1 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: v_or_b32_e32 v8, s14, v31 -; SI-NEXT: s_and_b32 s14, s62, 0xff -; SI-NEXT: s_lshl_b32 s15, s65, 8 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 +; SI-NEXT: v_or_b32_e32 v8, s14, v5 +; SI-NEXT: s_and_b32 s14, s39, 0xff +; SI-NEXT: s_lshl_b32 s15, s94, 8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v61, v45, v1 +; SI-NEXT: v_or_b32_e32 v31, v45, v1 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: v_or_b32_e32 v7, s14, v61 -; SI-NEXT: s_and_b32 s14, s98, 0xff +; SI-NEXT: v_or_b32_e32 v7, s14, v31 +; SI-NEXT: s_and_b32 s14, s87, 0xff ; SI-NEXT: s_lshl_b32 s15, s67, 8 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_or_b32_e32 v6, v47, v1 +; SI-NEXT: v_mov_b32_e32 v26, v34 +; SI-NEXT: v_mov_b32_e32 v34, v22 +; SI-NEXT: v_mov_b32_e32 v22, v61 +; SI-NEXT: v_or_b32_e32 v61, v46, v1 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: v_or_b32_e32 v4, s14, v6 +; SI-NEXT: v_or_b32_e32 v4, s14, v61 ; SI-NEXT: s_and_b32 s14, s92, 0xff ; SI-NEXT: s_lshl_b32 s15, s7, 8 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_readlane_b32 s8, v62, 55 +; SI-NEXT: v_readlane_b32 s8, v62, 54 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: 
v_mov_b32_e32 v22, v14 -; SI-NEXT: v_or_b32_e32 v14, v56, v1 +; SI-NEXT: v_or_b32_e32 v6, v56, v1 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: s_and_b32 s8, s8, 0xffff -; SI-NEXT: v_or_b32_e32 v2, s14, v14 -; SI-NEXT: s_and_b32 s14, s70, 0xff -; SI-NEXT: s_lshl_b32 s15, s94, 8 +; SI-NEXT: v_readlane_b32 s11, v62, 55 +; SI-NEXT: v_mov_b32_e32 v28, v36 +; SI-NEXT: v_or_b32_e32 v36, s14, v6 +; SI-NEXT: s_and_b32 s14, s26, 0xff +; SI-NEXT: s_lshl_b32 s15, s62, 8 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 -; SI-NEXT: s_or_b32 s42, s8, s63 +; SI-NEXT: s_or_b32 s42, s8, s11 ; SI-NEXT: v_readlane_b32 s8, v62, 56 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_and_b32 s8, s8, 0xffff -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v32, v23 -; SI-NEXT: v_mov_b32_e32 v23, v18 -; SI-NEXT: v_or_b32_e32 v18, v57, v1 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_or_b32_e32 v14, v57, v1 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: s_or_b32 s40, s8, s5 -; SI-NEXT: v_readlane_b32 s8, v62, 57 -; SI-NEXT: v_or_b32_e32 v1, s14, v18 +; SI-NEXT: v_readlane_b32 s8, v62, 58 +; SI-NEXT: v_or_b32_e32 v1, s14, v14 ; SI-NEXT: s_and_b32 s14, s88, 0xff ; SI-NEXT: s_lshl_b32 s15, s79, 8 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v34 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v37 ; SI-NEXT: s_and_b32 s8, s8, 0xffff -; SI-NEXT: v_readlane_b32 s9, v62, 60 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_or_b32 s15, s8, s41 -; SI-NEXT: v_readlane_b32 s8, v62, 58 -; SI-NEXT: s_and_b32 s16, s9, 0xffff -; SI-NEXT: v_mov_b32_e32 v27, v26 -; SI-NEXT: v_mov_b32_e32 v26, v24 -; SI-NEXT: v_mov_b32_e32 v24, v19 -; SI-NEXT: v_or_b32_e32 v19, v58, v3 +; SI-NEXT: v_readlane_b32 s8, v62, 59 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: v_mov_b32_e32 v27, v35 +; SI-NEXT: v_mov_b32_e32 v35, v23 +; SI-NEXT: v_mov_b32_e32 v23, v18 +; SI-NEXT: v_or_b32_e32 v18, v58, v3 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: s_and_b32 s8, s8, 
0xffff ; SI-NEXT: s_or_b32 s36, s16, s56 -; SI-NEXT: s_and_b32 s16, s48, 0xffff -; SI-NEXT: v_or_b32_e32 v3, s14, v19 +; SI-NEXT: s_and_b32 s16, s73, 0xffff +; SI-NEXT: v_or_b32_e32 v3, s14, v18 ; SI-NEXT: s_or_b32 s14, s8, s43 ; SI-NEXT: s_and_b32 s8, s13, 0xffff ; SI-NEXT: s_or_b32 s53, s16, vcc_lo @@ -199127,49 +199260,46 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s49, s16, s74 ; SI-NEXT: s_and_b32 s16, s58, 0xffff ; SI-NEXT: s_or_b32 s10, s8, s46 -; SI-NEXT: v_readlane_b32 s8, v62, 59 +; SI-NEXT: v_readlane_b32 s8, v62, 60 ; SI-NEXT: s_or_b32 s48, s16, s75 ; SI-NEXT: s_and_b32 s16, s59, 0xffff ; SI-NEXT: s_and_b32 s8, s8, 0xffff -; SI-NEXT: s_or_b32 s11, s16, s76 +; SI-NEXT: s_or_b32 s39, s16, s76 ; SI-NEXT: s_and_b32 s16, s60, 0xffff ; SI-NEXT: s_and_b32 s23, s61, 0xffff -; SI-NEXT: s_mov_b32 s30, s87 -; SI-NEXT: s_mov_b32 s87, s85 ; SI-NEXT: s_or_b32 s8, s8, s47 -; SI-NEXT: s_or_b32 s9, s16, s77 -; SI-NEXT: s_or_b32 s16, s23, s78 -; SI-NEXT: v_mov_b32_e32 v36, v35 -; SI-NEXT: v_mov_b32_e32 v30, v37 -; SI-NEXT: v_mov_b32_e32 v35, v45 -; SI-NEXT: v_mov_b32_e32 v20, v47 -; SI-NEXT: v_mov_b32_e32 v49, v56 -; SI-NEXT: v_mov_b32_e32 v48, v39 -; SI-NEXT: v_mov_b32_e32 v39, v57 -; SI-NEXT: v_mov_b32_e32 v25, v58 +; SI-NEXT: s_or_b32 s70, s16, s77 +; SI-NEXT: s_or_b32 s9, s23, s78 +; SI-NEXT: v_mov_b32_e32 v24, v45 +; SI-NEXT: v_mov_b32_e32 v48, v46 +; SI-NEXT: v_mov_b32_e32 v30, v56 +; SI-NEXT: v_mov_b32_e32 v49, v57 +; SI-NEXT: v_mov_b32_e32 v2, v58 ; SI-NEXT: v_alignbit_b32 v57, s42, v51, 16 ; SI-NEXT: v_alignbit_b32 v58, s40, v52, 16 ; SI-NEXT: v_alignbit_b32 v56, s15, v53, 16 ; SI-NEXT: v_alignbit_b32 v47, s14, v54, 16 -; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_alignbit_b32 v46, s13, v55, 16 ; SI-NEXT: v_alignbit_b32 v45, s12, v40, 16 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v44, s10, v41, 16 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v43, s8, v42, 16 -; SI-NEXT: 
v_alignbit_b32 v42, s36, v28, 16 +; SI-NEXT: v_alignbit_b32 v42, s36, v59, 16 ; SI-NEXT: v_alignbit_b32 v41, s53, v60, 16 -; SI-NEXT: v_alignbit_b32 v40, s94, v31, 16 -; SI-NEXT: v_alignbit_b32 v55, s49, v61, 16 -; SI-NEXT: v_alignbit_b32 v54, s48, v6, 16 -; SI-NEXT: v_alignbit_b32 v53, s11, v14, 16 -; SI-NEXT: v_mov_b32_e32 v14, v22 -; SI-NEXT: v_alignbit_b32 v52, s9, v18, 16 +; SI-NEXT: v_mov_b32_e32 v60, v20 +; SI-NEXT: v_mov_b32_e32 v20, v25 +; SI-NEXT: v_alignbit_b32 v40, s94, v5, 16 +; SI-NEXT: v_alignbit_b32 v55, s49, v31, 16 +; SI-NEXT: v_alignbit_b32 v54, s48, v61, 16 +; SI-NEXT: v_mov_b32_e32 v61, v22 +; SI-NEXT: v_mov_b32_e32 v22, v34 +; SI-NEXT: v_alignbit_b32 v53, s39, v6, 16 +; SI-NEXT: s_mov_b32 s16, s62 +; SI-NEXT: v_alignbit_b32 v52, s70, v14, 16 +; SI-NEXT: v_alignbit_b32 v51, s9, v18, 16 ; SI-NEXT: v_mov_b32_e32 v18, v23 -; SI-NEXT: v_alignbit_b32 v51, s16, v19, 16 -; SI-NEXT: v_mov_b32_e32 v19, v24 -; SI-NEXT: v_mov_b32_e32 v24, v26 -; SI-NEXT: s_lshr_b32 s73, s63, 16 +; SI-NEXT: v_mov_b32_e32 v23, v35 +; SI-NEXT: s_lshr_b32 s73, s11, 16 ; SI-NEXT: s_lshr_b32 s72, s5, 16 ; SI-NEXT: s_lshr_b32 s63, s41, 16 ; SI-NEXT: s_lshr_b32 s62, s43, 16 @@ -199181,87 +199311,85 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_lshr_b32 s56, vcc_lo, 16 ; SI-NEXT: s_lshr_b32 s47, vcc_hi, 16 ; SI-NEXT: s_lshr_b32 s46, s74, 16 -; SI-NEXT: v_readlane_b32 s25, v62, 62 -; SI-NEXT: v_readlane_b32 s74, v62, 61 +; SI-NEXT: v_readlane_b32 s34, v62, 61 ; SI-NEXT: s_lshr_b32 s45, s75, 16 +; SI-NEXT: v_readlane_b32 s6, v62, 62 ; SI-NEXT: s_lshr_b32 s44, s76, 16 -; SI-NEXT: s_mov_b32 s76, s37 -; SI-NEXT: s_mov_b32 s37, s97 -; SI-NEXT: s_mov_b32 s97, s21 -; SI-NEXT: s_mov_b32 s21, s18 -; SI-NEXT: s_mov_b32 s18, s17 -; SI-NEXT: s_mov_b32 s85, s87 -; SI-NEXT: s_mov_b32 s87, s30 -; SI-NEXT: s_mov_b32 s17, s24 +; SI-NEXT: s_mov_b32 s11, s37 +; SI-NEXT: v_readlane_b32 s37, v62, 57 +; SI-NEXT: s_mov_b32 s76, s97 +; SI-NEXT: s_mov_b32 
s97, s4 +; SI-NEXT: v_readlane_b32 s85, v62, 63 ; SI-NEXT: s_lshr_b32 s43, s77, 16 ; SI-NEXT: s_mov_b32 s77, s35 ; SI-NEXT: s_mov_b32 s35, s64 ; SI-NEXT: s_mov_b32 s64, s55 ; SI-NEXT: s_mov_b32 s55, s83 ; SI-NEXT: s_mov_b32 s83, s99 -; SI-NEXT: s_mov_b32 s99, s89 -; SI-NEXT: s_mov_b32 s89, s39 -; SI-NEXT: s_mov_b32 s39, s95 -; SI-NEXT: s_mov_b32 s95, s6 +; SI-NEXT: s_mov_b32 s99, s38 +; SI-NEXT: s_mov_b32 s38, s20 +; SI-NEXT: s_mov_b32 s20, s21 +; SI-NEXT: s_mov_b32 s21, s52 ; SI-NEXT: s_lshr_b32 s41, s78, 16 -; SI-NEXT: s_mov_b32 s24, s96 -; SI-NEXT: s_mov_b32 s96, s4 +; SI-NEXT: s_mov_b32 s52, s82 +; SI-NEXT: s_mov_b32 s82, s66 +; SI-NEXT: s_mov_b32 s66, s96 +; SI-NEXT: s_mov_b32 s96, s98 +; SI-NEXT: s_mov_b32 s98, s89 +; SI-NEXT: s_mov_b32 s89, s18 +; SI-NEXT: s_mov_b32 s18, s30 +; SI-NEXT: v_mov_b32_e32 v6, v19 +; SI-NEXT: v_mov_b32_e32 v14, v21 ; SI-NEXT: s_cbranch_execnz .LBB97_3 ; SI-NEXT: .LBB97_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v36 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_mov_b32_e32 v6, v5 -; SI-NEXT: v_mov_b32_e32 v5, v27 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; SI-NEXT: s_add_i32 s4, s88, 3 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_lshl_b32 s5, s79, 8 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v37 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v1, s4, v1 -; SI-NEXT: v_readlane_b32 s4, v62, 11 -; SI-NEXT: s_add_i32 s4, s4, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 10 -; SI-NEXT: v_readlane_b32 s6, v62, 9 +; 
SI-NEXT: s_add_i32 s4, s82, 3 +; SI-NEXT: v_readlane_b32 s8, v62, 12 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_add_i32 s8, s6, 3 +; SI-NEXT: s_lshl_b32 s5, s52, 8 +; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_readlane_b32 s5, v62, 8 +; SI-NEXT: v_readlane_b32 s5, v62, 11 ; SI-NEXT: s_and_b32 s8, s8, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 24 ; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_add_i32 s70, s70, 3 -; SI-NEXT: v_readlane_b32 s6, v62, 30 ; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readlane_b32 s5, v62, 33 +; SI-NEXT: s_add_i32 s70, s5, 3 ; SI-NEXT: s_and_b32 s5, s70, 0xff -; SI-NEXT: s_lshl_b32 s8, s6, 8 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v48 +; SI-NEXT: s_lshl_b32 s8, s16, 8 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v39 ; SI-NEXT: s_or_b32 s5, s8, s5 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_addk_i32 s5, 0x300 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: v_or_b32_e32 v2, v39, v2 +; SI-NEXT: v_or_b32_e32 v2, v49, v2 ; SI-NEXT: v_or_b32_e32 v2, s5, v2 ; SI-NEXT: s_add_i32 s5, s35, 3 -; SI-NEXT: v_readlane_b32 s6, v62, 13 +; SI-NEXT: v_readlane_b32 s8, v62, 14 ; SI-NEXT: s_and_b32 s5, s5, 0xff -; SI-NEXT: s_lshl_b32 s8, s6, 8 +; SI-NEXT: s_lshl_b32 s8, s8, 8 ; SI-NEXT: s_add_i32 s9, s77, 3 ; SI-NEXT: s_or_b32 s5, s8, s5 -; SI-NEXT: v_readlane_b32 s6, v62, 12 +; SI-NEXT: v_readlane_b32 s8, v62, 13 ; SI-NEXT: s_and_b32 s9, s9, 0xff -; SI-NEXT: s_lshl_b32 s8, s6, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 24 ; SI-NEXT: s_lshl_b32 s9, s9, 16 ; SI-NEXT: s_addk_i32 s5, 0x300 ; SI-NEXT: s_or_b32 s8, s8, s9 @@ -199269,9 +199397,9 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s5, s8, s5 ; SI-NEXT: s_add_i32 s79, s92, 3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v1 -; SI-NEXT: s_add_i32 s16, s4, 0x3000000 
+; SI-NEXT: s_add_i32 s9, s4, 0x3000000 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v2 -; SI-NEXT: s_add_i32 s9, s5, 0x3000000 +; SI-NEXT: s_add_i32 s70, s5, 0x3000000 ; SI-NEXT: s_and_b32 s4, s79, 0xff ; SI-NEXT: s_lshl_b32 s5, s7, 8 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v33 @@ -199280,16 +199408,12 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v2, v49, v2 +; SI-NEXT: v_or_b32_e32 v2, v30, v2 ; SI-NEXT: v_or_b32_e32 v2, s4, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: s_add_i32 s4, s24, 3 +; SI-NEXT: s_add_i32 s4, s66, 3 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_lshl_b32 s5, s55, 8 -; SI-NEXT: s_add_i32 s8, s37, 3 +; SI-NEXT: s_add_i32 s8, s76, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s8, s8, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 @@ -199298,8 +199422,8 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s52, s98, 3 -; SI-NEXT: s_add_i32 s11, s4, 0x3000000 +; SI-NEXT: s_add_i32 s52, s87, 3 +; SI-NEXT: s_add_i32 s39, s4, 0x3000000 ; SI-NEXT: s_and_b32 s4, s52, 0xff ; SI-NEXT: s_lshl_b32 s5, s67, 8 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v38 @@ -199308,64 +199432,68 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v4, v20, v4 -; SI-NEXT: s_add_i32 s30, s87, 3 +; SI-NEXT: v_or_b32_e32 v4, v48, v4 +; SI-NEXT: s_add_i32 s30, s18, 3 ; SI-NEXT: v_or_b32_e32 v4, s4, v4 ; SI-NEXT: s_and_b32 s4, s30, 0xff -; SI-NEXT: 
s_lshl_b32 s5, s21, 8 -; SI-NEXT: s_add_i32 s8, s68, 3 +; SI-NEXT: s_lshl_b32 s5, s98, 8 +; SI-NEXT: s_add_i32 s8, s6, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s8, s8, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s28, 24 +; SI-NEXT: s_lshl_b32 s5, s27, 24 ; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s48, s4, 0x3000000 ; SI-NEXT: v_readlane_b32 s4, v62, 42 -; SI-NEXT: v_mov_b32_e32 v22, v30 ; SI-NEXT: s_add_i32 s87, s4, 3 ; SI-NEXT: v_readlane_b32 s5, v62, 39 ; SI-NEXT: s_and_b32 s4, s87, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v32 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v7, v35, v7 -; SI-NEXT: v_or_b32_e32 v7, s4, v7 -; SI-NEXT: v_readlane_b32 s4, v62, 32 +; SI-NEXT: v_or_b32_e32 v5, v24, v5 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: v_readlane_b32 s4, v62, 30 ; SI-NEXT: s_add_i32 s67, s4, 3 ; SI-NEXT: s_and_b32 s4, s67, 0xff -; SI-NEXT: s_lshl_b32 s5, s85, 8 +; SI-NEXT: s_lshl_b32 s5, s86, 8 ; SI-NEXT: s_add_i32 s8, s69, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s8, s8, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s74, 24 +; SI-NEXT: s_lshl_b32 s5, s34, 24 ; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s50, s90, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 23 ; SI-NEXT: s_add_i32 s49, s4, 0x3000000 +; SI-NEXT: v_readlane_b32 s4, v62, 24 +; SI-NEXT: v_mov_b32_e32 v25, v28 +; SI-NEXT: s_add_i32 s50, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v62, 22 +; SI-NEXT: v_add_i32_e32 v7, vcc, 
0x3000000, v5 ; SI-NEXT: s_and_b32 s4, s50, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v25 ; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_add_i32 s94, s86, 3 -; SI-NEXT: v_or_b32_e32 v8, s4, v8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v5, v8, v5 +; SI-NEXT: s_add_i32 s94, s89, 3 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 ; SI-NEXT: s_and_b32 s4, s94, 0xff -; SI-NEXT: s_lshl_b32 s5, s38, 8 +; SI-NEXT: s_lshl_b32 s5, s85, 8 ; SI-NEXT: s_add_i32 s8, s71, 3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s8, s8, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 @@ -199375,25 +199503,27 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s94, s4, 0x3000000 -; SI-NEXT: v_readlane_b32 s4, v62, 52 +; SI-NEXT: v_readlane_b32 s4, v62, 51 +; SI-NEXT: v_mov_b32_e32 v21, v27 ; SI-NEXT: s_add_i32 s18, s4, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 51 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v5 ; SI-NEXT: s_and_b32 s4, s18, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_lshl_b32 s5, s37, 8 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v21 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v9, v24, v9 -; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v9, v5 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 ; SI-NEXT: v_readlane_b32 s4, v62, 45 ; SI-NEXT: s_add_i32 s98, s4, 3 ; 
SI-NEXT: v_readlane_b32 s5, v62, 43 -; SI-NEXT: v_readlane_b32 s6, v62, 14 ; SI-NEXT: s_and_b32 s4, s98, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_add_i32 s8, s6, 3 +; SI-NEXT: s_add_i32 s8, s91, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s8, s8, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 @@ -199404,27 +199534,33 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s53, s4, 0x3000000 ; SI-NEXT: v_readlane_b32 s4, v62, 41 +; SI-NEXT: v_mov_b32_e32 v19, v26 ; SI-NEXT: s_add_i32 s86, s4, 3 ; SI-NEXT: v_readlane_b32 s5, v62, 38 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v5 ; SI-NEXT: s_and_b32 s4, s86, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v19 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v10, v59, v10 -; SI-NEXT: v_or_b32_e32 v10, s4, v10 -; SI-NEXT: v_readlane_b32 s4, v62, 31 +; SI-NEXT: v_or_b32_e32 v5, v23, v5 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s4, v62, 29 ; SI-NEXT: s_add_i32 s66, s4, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 27 +; SI-NEXT: v_readlane_b32 s5, v62, 26 ; SI-NEXT: s_and_b32 s4, s66, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_add_i32 s37, s39, 3 +; SI-NEXT: s_add_i32 s37, s38, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readlane_b32 s5, v62, 21 ; SI-NEXT: s_and_b32 s8, s37, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s95, 24 +; SI-NEXT: s_lshl_b32 s5, s5, 24 ; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s8 @@ -199439,33 +199575,65 @@ define 
inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: v_readlane_b32 s5, v62, 37 +; SI-NEXT: v_readlane_b32 s6, v62, 34 ; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: s_add_i32 s71, s22, 3 +; SI-NEXT: s_add_i32 s71, s6, 3 ; SI-NEXT: s_and_b32 s8, s71, 0xff ; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_add_i32 s35, s99, 3 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v32, v11 -; SI-NEXT: v_or_b32_e32 v11, s4, v11 +; SI-NEXT: s_and_b32 s6, s35, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_mov_b32_e32 v31, s9 +; SI-NEXT: v_mov_b32_e32 v39, s70 +; SI-NEXT: v_add_i32_e32 v36, vcc, 0x3000000, v2 +; SI-NEXT: v_mov_b32_e32 v28, s39 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v4 +; SI-NEXT: v_mov_b32_e32 v27, s48 +; SI-NEXT: v_mov_b32_e32 v26, s49 +; SI-NEXT: v_mov_b32_e32 v25, s94 +; SI-NEXT: v_mov_b32_e32 v24, s53 +; SI-NEXT: v_mov_b32_e32 v23, s36 +; SI-NEXT: v_alignbit_b32 v42, v23, v10, 16 +; SI-NEXT: v_alignbit_b32 v41, v24, v9, 16 +; SI-NEXT: v_alignbit_b32 v40, v25, v8, 16 +; SI-NEXT: v_alignbit_b32 v55, v26, v7, 16 +; SI-NEXT: v_alignbit_b32 v54, v27, v4, 16 +; SI-NEXT: v_alignbit_b32 v53, v28, v36, 16 +; SI-NEXT: v_alignbit_b32 v52, v39, v1, 16 +; SI-NEXT: v_alignbit_b32 v51, v31, v3, 16 +; SI-NEXT: s_lshr_b32 s57, s36, 16 +; SI-NEXT: s_lshr_b32 s56, s53, 16 +; SI-NEXT: s_lshr_b32 s47, s94, 16 +; SI-NEXT: s_lshr_b32 s46, s49, 16 +; SI-NEXT: s_lshr_b32 s45, s48, 16 +; SI-NEXT: s_lshr_b32 s44, s39, 16 +; SI-NEXT: s_lshr_b32 s43, 
s70, 16 +; SI-NEXT: s_lshr_b32 s41, s9, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v22, v5 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; SI-NEXT: v_readlane_b32 s4, v62, 40 ; SI-NEXT: s_add_i32 s85, s4, 3 ; SI-NEXT: s_and_b32 s4, s85, 0xff ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_readlane_b32 s5, v62, 33 +; SI-NEXT: v_readlane_b32 s5, v62, 31 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: s_lshl_b32 s5, s5, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s8, s4, 0x3000000 -; SI-NEXT: v_readlane_b32 s4, v62, 54 +; SI-NEXT: v_readlane_b32 s4, v62, 53 ; SI-NEXT: s_add_i32 s17, s4, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 53 +; SI-NEXT: v_readlane_b32 s5, v62, 52 ; SI-NEXT: s_and_b32 s4, s17, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s5, s4 @@ -199473,58 +199641,25 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: v_readlane_b32 s5, v62, 47 ; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_and_b32 s6, s35, 0xff -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: v_mov_b32_e32 v30, s16 -; SI-NEXT: v_mov_b32_e32 v39, s9 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v2 -; SI-NEXT: v_mov_b32_e32 v28, s11 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v4 -; SI-NEXT: v_mov_b32_e32 v27, s48 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v7 -; SI-NEXT: v_mov_b32_e32 v26, s49 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v8 -; SI-NEXT: v_mov_b32_e32 v25, s94 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v9 -; SI-NEXT: v_mov_b32_e32 v24, s53 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v10 -; SI-NEXT: v_mov_b32_e32 v23, s36 -; SI-NEXT: v_add_i32_e32 v11, 
vcc, 0x3000000, v11 ; SI-NEXT: v_mov_b32_e32 v22, s8 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v43, v22, v11, 16 -; SI-NEXT: v_alignbit_b32 v42, v23, v10, 16 -; SI-NEXT: v_alignbit_b32 v41, v24, v9, 16 -; SI-NEXT: v_alignbit_b32 v40, v25, v8, 16 -; SI-NEXT: v_alignbit_b32 v55, v26, v7, 16 -; SI-NEXT: v_alignbit_b32 v54, v27, v4, 16 -; SI-NEXT: v_alignbit_b32 v53, v28, v2, 16 -; SI-NEXT: v_alignbit_b32 v52, v39, v1, 16 -; SI-NEXT: v_alignbit_b32 v51, v30, v3, 16 ; SI-NEXT: s_lshr_b32 s58, s8, 16 -; SI-NEXT: s_lshr_b32 s57, s36, 16 -; SI-NEXT: s_lshr_b32 s56, s53, 16 -; SI-NEXT: s_lshr_b32 s47, s94, 16 -; SI-NEXT: s_lshr_b32 s46, s49, 16 -; SI-NEXT: s_lshr_b32 s45, s48, 16 -; SI-NEXT: s_lshr_b32 s44, s11, 16 -; SI-NEXT: s_lshr_b32 s43, s9, 16 -; SI-NEXT: s_lshr_b32 s41, s16, 16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v12, v5 +; SI-NEXT: v_or_b32_e32 v5, v20, v5 ; SI-NEXT: v_or_b32_e32 v5, s4, v5 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; SI-NEXT: v_readlane_b32 s4, v62, 48 ; SI-NEXT: s_add_i32 s7, s4, 3 ; SI-NEXT: s_and_b32 s4, s7, 0xff ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s89, 24 +; SI-NEXT: s_lshl_b32 s5, s22, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_or_b32 s4, s5, s4 @@ -199539,32 +199674,35 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v5, v19, v5 +; 
SI-NEXT: v_readlane_b32 s5, v62, 35 +; SI-NEXT: v_readlane_b32 s6, v62, 27 +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_add_i32 s55, s6, 3 +; SI-NEXT: s_and_b32 s6, s55, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: v_mov_b32_e32 v20, s10 +; SI-NEXT: v_alignbit_b32 v44, v20, v12, 16 +; SI-NEXT: s_lshr_b32 s59, s10, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v14, v5 ; SI-NEXT: v_or_b32_e32 v5, s4, v5 ; SI-NEXT: v_readlane_b32 s4, v62, 36 ; SI-NEXT: s_add_i32 s81, s4, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 35 -; SI-NEXT: v_readlane_b32 s6, v62, 28 ; SI-NEXT: s_and_b32 s4, s81, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_add_i32 s55, s6, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_readlane_b32 s5, v62, 26 -; SI-NEXT: s_and_b32 s6, s55, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s5, 24 -; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s5, s20, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s12, s4, 0x3000000 -; SI-NEXT: v_readlane_b32 s4, v62, 34 +; SI-NEXT: v_readlane_b32 s4, v62, 32 ; SI-NEXT: s_add_i32 s69, s4, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 29 +; SI-NEXT: v_readlane_b32 s5, v62, 28 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v5 ; SI-NEXT: s_and_b32 s4, s69, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v6 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: s_addk_i32 s4, 0x300 @@ -199572,10 +199710,10 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: v_or_b32_e32 v5, v18, v5 ; SI-NEXT: v_or_b32_e32 v5, s4, v5 -; SI-NEXT: v_readlane_b32 s4, v62, 22 +; SI-NEXT: v_readlane_b32 s4, v62, 20 ; SI-NEXT: s_add_i32 s34, s4, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 21 -; SI-NEXT: v_readlane_b32 s6, v62, 19 +; SI-NEXT: v_readlane_b32 s5, v62, 19 +; 
SI-NEXT: v_readlane_b32 s6, v62, 17 ; SI-NEXT: s_and_b32 s4, s34, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_add_i32 s92, s6, 3 @@ -199590,21 +199728,21 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_add_i32 s13, s4, 0x3000000 ; SI-NEXT: v_readlane_b32 s4, v62, 25 ; SI-NEXT: s_add_i32 s51, s4, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 24 +; SI-NEXT: v_readlane_b32 s5, v62, 23 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x3000000, v5 ; SI-NEXT: s_and_b32 s4, s51, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v29 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v5, v14, v5 +; SI-NEXT: v_or_b32_e32 v5, v61, v5 ; SI-NEXT: v_or_b32_e32 v5, s4, v5 -; SI-NEXT: v_readlane_b32 s4, v62, 20 +; SI-NEXT: v_readlane_b32 s4, v62, 18 ; SI-NEXT: s_add_i32 s95, s4, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 18 +; SI-NEXT: v_readlane_b32 s5, v62, 16 ; SI-NEXT: s_and_b32 s4, s95, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_add_i32 s6, s96, 3 @@ -199621,37 +199759,35 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_lshl_b32 s5, s84, 8 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v6, v6, v13 +; SI-NEXT: v_or_b32_e32 v6, v60, v13 ; SI-NEXT: v_or_b32_e32 v6, s4, v6 ; SI-NEXT: s_add_i32 s4, s83, 3 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_lshl_b32 s5, s68, 8 ; SI-NEXT: s_add_i32 s6, s64, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_readlane_b32 s5, v62, 15 ; SI-NEXT: s_and_b32 s6, s6, 0xff ; SI-NEXT: s_addk_i32 
s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_lshl_b32 s5, s65, 24 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s15, s4, 0x3000000 -; SI-NEXT: v_readlane_b32 s4, v62, 2 +; SI-NEXT: v_readlane_b32 s4, v62, 4 ; SI-NEXT: s_add_i32 s4, s4, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 1 +; SI-NEXT: v_readlane_b32 s5, v62, 3 +; SI-NEXT: v_readlane_b32 s6, v62, 2 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_add_i32 s6, s26, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readlane_b32 s5, v62, 1 ; SI-NEXT: s_and_b32 s6, s6, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s27, 24 +; SI-NEXT: s_lshl_b32 s5, s5, 24 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 @@ -199661,21 +199797,20 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_add_i32 s4, s4, 3 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: s_add_i32 s6, s76, 3 +; SI-NEXT: s_add_i32 s6, s11, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_readlane_b32 s5, v62, 16 ; SI-NEXT: s_and_b32 s6, s6, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_lshl_b32 s5, s90, 24 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s40, s4, 0x3000000 -; SI-NEXT: v_readlane_b32 s4, v62, 7 +; SI-NEXT: v_readlane_b32 s4, v62, 10 ; SI-NEXT: s_add_i32 s4, s4, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 17 -; SI-NEXT: v_readlane_b32 s6, v62, 6 +; SI-NEXT: v_readlane_b32 s5, v62, 15 +; SI-NEXT: v_readlane_b32 s6, v62, 9 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_add_i32 s6, s6, 3 @@ -199687,15 +199822,16 @@ define inreg <64 x i16> 
@bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s18, s4, 0x3000000 -; SI-NEXT: s_add_i32 s4, s20, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 5 -; SI-NEXT: v_readlane_b32 s6, v62, 4 +; SI-NEXT: s_add_i32 s21, s4, 0x3000000 +; SI-NEXT: v_readlane_b32 s4, v62, 8 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v62, 7 +; SI-NEXT: v_readlane_b32 s6, v62, 6 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_readlane_b32 s5, v62, 3 +; SI-NEXT: v_readlane_b32 s5, v62, 5 ; SI-NEXT: s_and_b32 s6, s6, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: s_lshl_b32 s5, s5, 24 @@ -199704,8 +199840,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s42, s4, 0x3000000 -; SI-NEXT: v_mov_b32_e32 v13, s18 -; SI-NEXT: v_mov_b32_e32 v20, s10 +; SI-NEXT: v_mov_b32_e32 v13, s21 ; SI-NEXT: v_mov_b32_e32 v19, s12 ; SI-NEXT: v_mov_b32_e32 v18, s13 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x3000000, v5 @@ -199717,20 +199852,16 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_alignbit_b32 v58, s40, v13, 16 ; SI-NEXT: v_alignbit_b32 v56, v6, v50, 16 ; SI-NEXT: v_alignbit_b32 v47, v5, v17, 16 -; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_alignbit_b32 v46, v18, v16, 16 ; SI-NEXT: v_alignbit_b32 v45, v19, v15, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v44, v20, v12, 16 ; SI-NEXT: s_lshr_b32 s73, s42, 16 ; SI-NEXT: s_lshr_b32 s72, s40, 16 ; SI-NEXT: s_lshr_b32 s63, s15, 16 ; SI-NEXT: s_lshr_b32 s62, s14, 16 ; SI-NEXT: s_lshr_b32 s61, s13, 16 ; SI-NEXT: s_lshr_b32 s60, s12, 16 -; SI-NEXT: s_lshr_b32 s59, s10, 16 ; SI-NEXT: .LBB97_3: ; %end -; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_and_b32 s4, 
s21, 0xffff ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v57 ; SI-NEXT: v_or_b32_e32 v5, s4, v5 ; SI-NEXT: s_and_b32 s4, s42, 0xffff @@ -199739,6 +199870,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_mov_b32_e32 v6, s4 ; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen ; SI-NEXT: s_and_b32 s4, s17, 0xffff @@ -199896,9 +200028,9 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x64, v0 ; SI-NEXT: v_mov_b32_e32 v5, s4 ; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v36 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v53 -; SI-NEXT: s_and_b32 s4, s11, 0xffff +; SI-NEXT: s_and_b32 s4, s39, 0xffff ; SI-NEXT: s_lshl_b32 s5, s44, 16 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 @@ -199910,7 +200042,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 -; SI-NEXT: s_and_b32 s4, s9, 0xffff +; SI-NEXT: s_and_b32 s4, s70, 0xffff ; SI-NEXT: s_lshl_b32 s5, s43, 16 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 @@ -199923,7 +200055,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_and_b32 s4, s9, 0xffff ; SI-NEXT: s_lshl_b32 s5, s41, 16 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 @@ -199984,27 +200116,26 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, 
i3 ; SI-NEXT: v_readlane_b32 s31, v63, 1 ; SI-NEXT: v_readlane_b32 s30, v63, 0 ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB97_4: -; SI-NEXT: v_mov_b32_e32 v5, v13 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v25, v58 -; SI-NEXT: v_mov_b32_e32 v48, v39 -; SI-NEXT: v_mov_b32_e32 v39, v57 -; SI-NEXT: v_mov_b32_e32 v49, v56 -; SI-NEXT: v_mov_b32_e32 v20, v47 -; SI-NEXT: v_mov_b32_e32 v30, v37 -; SI-NEXT: v_mov_b32_e32 v36, v35 -; SI-NEXT: v_mov_b32_e32 v35, v45 -; SI-NEXT: v_mov_b32_e32 v27, v26 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v32, v23 -; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v2, v58 +; SI-NEXT: v_mov_b32_e32 v49, v57 +; SI-NEXT: v_mov_b32_e32 v30, v56 +; SI-NEXT: v_mov_b32_e32 v48, v46 +; SI-NEXT: v_mov_b32_e32 v24, v45 +; SI-NEXT: v_mov_b32_e32 v28, v36 +; SI-NEXT: v_mov_b32_e32 v27, v35 +; SI-NEXT: v_mov_b32_e32 v26, v34 +; SI-NEXT: v_mov_b32_e32 v13, v21 +; SI-NEXT: ; implicit-def: $sgpr21 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $sgpr42 ; SI-NEXT: ; implicit-def: $sgpr73 @@ -200056,17 +200187,17 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> 
inreg %a, i3 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $sgpr48 ; SI-NEXT: ; implicit-def: $sgpr45 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr39 ; SI-NEXT: ; implicit-def: $sgpr44 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr70 ; SI-NEXT: ; implicit-def: $sgpr43 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $sgpr16 +; SI-NEXT: ; implicit-def: $sgpr9 ; SI-NEXT: ; implicit-def: $sgpr41 ; SI-NEXT: s_branch .LBB97_2 ; @@ -200130,13 +200261,14 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 ; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v7 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v40, 8, v27 ; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 @@ -200148,46 +200280,42 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 
s32 offset:476 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v8 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v10 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v12 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v26 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v24 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v28 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v30 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 
s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:200 @@ -200196,34 +200324,37 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; 
VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 ; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:232 ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 -; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v22 -; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v24 +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v26 +; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v28 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 ; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v0 +; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:604 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v2 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264 @@ -200242,6 +200373,11 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v6 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:328 @@ -200250,12 +200386,8 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:28 ; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:36 -; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v6 ; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v3 ; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v5 -; VI-NEXT: s_waitcnt vmcnt(10) -; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 ; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v0 ; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:44 @@ -200264,47 +200396,45 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:68 ; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:76 ; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 
offset:100 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:108 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:116 ; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:124 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:140 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:156 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:164 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:172 ; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:180 ; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:196 ; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:204 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:212 ; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:220 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:228 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:244 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:228 +; VI-NEXT: 
buffer_load_ushort v44, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:252 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:260 ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:268 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:268 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:284 -; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:292 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:300 -; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:308 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:316 -; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:324 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:300 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:308 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, 
s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:316 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:324 ; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill @@ -200314,46 +200444,50 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; 
VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; VI-NEXT: 
buffer_store_dword v61, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword 
v47, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB97_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload @@ -200370,11 +200504,10 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: 
buffer_store_dword v3, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -200398,6 +200531,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v17, v10 ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload @@ -200414,38 +200548,43 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v1, v1, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v0, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v40, v42 +; VI-NEXT: v_mov_b32_e32 v42, v44 +; VI-NEXT: v_mov_b32_e32 v44, v45 +; VI-NEXT: v_mov_b32_e32 v45, v62 +; VI-NEXT: v_or_b32_sdwa v2, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v53, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded 
Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v34, v24 ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v36, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v37, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -200453,77 +200592,74 @@ define inreg <64 x i16> 
@bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v0, v38, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v39, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v48, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v45, v62 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v48, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, 
v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v32, v1 ; VI-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v54, v22 -; VI-NEXT: v_mov_b32_e32 v41, v24 ; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v34, v0 +; VI-NEXT: v_mov_b32_e32 v33, v0 ; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v37, v1 -; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v55, v26 +; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v50, v26 ; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v39, v0 -; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v49, v1 -; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v43, v27 +; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v51, v0 -; VI-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v35, v1 -; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v53, v28 +; VI-NEXT: v_mov_b32_e32 v53, v1 +; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v52, v28 ; VI-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v47, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v33, v0 -; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v57, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v47, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v36, v0 +; VI-NEXT: v_mov_b32_e32 v55, v0 +; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v35, v0 ; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v41, v1 +; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v63, v27 +; VI-NEXT: v_mov_b32_e32 v46, v57 ; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v56, v0 +; VI-NEXT: v_mov_b32_e32 v36, v0 ; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v58, v1 -; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v61, v60 -; VI-NEXT: v_mov_b32_e32 v60, v59 +; VI-NEXT: v_mov_b32_e32 v56, v1 +; VI-NEXT: v_or_b32_sdwa v1, v57, v1 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v61, v59 ; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload @@ -200535,55 +200671,53 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v45, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v44, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v50, v0 +; VI-NEXT: v_mov_b32_e32 v58, v0 ; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v1, v62, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v52, v0 -; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v43, v0 +; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v27, 
v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v0, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v59, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v46, v1 -; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v60, v1 +; VI-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v63, v0 -; VI-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v54, v0 +; VI-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v47, v1 -; VI-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; VI-NEXT: 
s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v57, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v0, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v31, v0, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 @@ -200615,12 +200749,10 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_cbranch_execnz .LBB97_3 ; VI-NEXT: .LBB97_2: ; %cmp.true -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v59 -; VI-NEXT: v_or_b32_sdwa v29, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 @@ -200639,165 +200771,147 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_lshl_b32 s9, s19, 8 ; VI-NEXT: s_add_i32 s16, s16, 3 ; VI-NEXT: s_lshl_b32 s10, s17, 8 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; 
VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v26, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v28, v60, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v59 +; VI-NEXT: v_or_b32_sdwa v25, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v62 -; VI-NEXT: v_or_b32_sdwa v28, v43, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v44 -; VI-NEXT: v_or_b32_sdwa v53, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v27, v63, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v45 -; VI-NEXT: v_or_b32_sdwa v27, v55, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v52, v43, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v44 +; VI-NEXT: v_or_b32_sdwa v26, v50, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v42 -; VI-NEXT: v_or_b32_sdwa v52, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v63, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v40 -; VI-NEXT: v_or_b32_sdwa v25, v48, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v60 -; VI-NEXT: v_or_b32_sdwa v59, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v43, v48, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v61 -; VI-NEXT: v_or_b32_sdwa v24, v58, v3 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v59, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v46 +; VI-NEXT: v_or_b32_sdwa v24, v56, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v48, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v48, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v48, vcc, 0x300, v48 ; VI-NEXT: v_or_b32_sdwa v24, v24, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v24 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: v_or_b32_sdwa v23, v41, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v38, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v38, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v38, vcc, 0x300, v38 ; VI-NEXT: 
v_or_b32_sdwa v23, v23, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v23 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v22, v54, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v22, v34, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v50, v33, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v50, vcc, 0x300, v50 +; VI-NEXT: v_or_b32_sdwa v36, v55, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v50, vcc, 0x300, v36 ; VI-NEXT: v_or_b32_sdwa v22, v22, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v22 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v21, v35, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v21, v53, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v54, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 
4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v53, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: v_or_b32_sdwa v20, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: v_or_b32_sdwa v49, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v49, vcc, 0x300, v49 ; VI-NEXT: v_or_b32_sdwa v20, v20, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v20 -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: v_or_b32_sdwa v19, v37, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v37, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v37, v33, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v37, vcc, 0x300, v37 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v31, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v19, v19, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v19 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: v_or_b32_sdwa v18, v32, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: 
v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v57, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v35, vcc, 0x300, v57 -; VI-NEXT: v_or_b32_sdwa v18, v18, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v58, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v58 +; VI-NEXT: v_or_b32_sdwa v18, v18, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v18 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v16, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v10, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:732 ; 4-byte 
Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v11, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v11, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v15, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v15, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v56, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v56, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v14, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v14, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v34, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v34 ; VI-NEXT: v_or_b32_sdwa v14, v14, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v14 @@ -200806,67 +200920,78 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v13, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v31, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v36, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v35, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v36 -; VI-NEXT: v_or_b32_sdwa v13, v13, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v26 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x300, v52 -; VI-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v52, vcc, 0x300, v54 -; VI-NEXT: v_or_b32_sdwa v21, v21, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v35, vcc, 0x300, v35 +; VI-NEXT: v_or_b32_sdwa v13, v13, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v35, vcc, 0x300, v25 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x300, v59 +; VI-NEXT: v_or_b32_sdwa v25, v43, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v28, v28, v35 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v13 -; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v21 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v26 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v25 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v28 +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v12, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v30, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v51, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v51 ; VI-NEXT: v_or_b32_sdwa v12, v12, v51 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v59 -; VI-NEXT: v_or_b32_sdwa v25, v25, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x300, v1 +; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v63 +; VI-NEXT: v_or_b32_sdwa v26, v26, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v30, v30, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12 -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v25 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v26 +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v30 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v33, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v57, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v40, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_or_b32_sdwa v30, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: 
buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v39, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v2 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -200890,15 +201015,14 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v41, vcc, 0x300, v10 ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v55 ; VI-NEXT: v_or_b32_sdwa v10, v39, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v53 -; VI-NEXT: v_or_b32_sdwa v27, v28, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v28, v29, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v29, v30, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v52 +; VI-NEXT: v_add_u32_e32 v52, vcc, 0x300, v53 +; VI-NEXT: v_or_b32_sdwa v21, v21, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v27, v27, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9 ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 +; VI-NEXT: v_add_u32_e32 v21, 
vcc, 0x3000000, v21 ; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v27 -; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v28 -; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v29 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -200914,18 +201038,14 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v42 ; VI-NEXT: v_or_b32_sdwa v8, v8, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v11 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x300, v40 -; VI-NEXT: v_or_b32_sdwa v11, v33, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x300, v1 -; VI-NEXT: v_or_b32_sdwa v30, v31, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v17, v17, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x300, v40 +; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 ; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 -; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v30 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v7, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload @@ -200965,19 +201085,29 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v46, v4, v3 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_or_b32_sdwa v29, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v46, vcc, 0x300, v46 ; VI-NEXT: v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v2 +; VI-NEXT: v_or_b32_sdwa v29, v29, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v29 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 ; VI-NEXT: v_or_b32_sdwa v4, v47, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v47, vcc, 3, v32 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x300, v4 ; VI-NEXT: v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 @@ -201044,35 +201174,38 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB97_4: -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v61, v60 -; VI-NEXT: v_mov_b32_e32 v60, v59 +; VI-NEXT: buffer_load_dword v56, off, 
s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v61, v59 +; VI-NEXT: v_mov_b32_e32 v46, v57 +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v40, v42 +; VI-NEXT: v_mov_b32_e32 v42, v44 +; VI-NEXT: v_mov_b32_e32 v44, v45 ; VI-NEXT: v_mov_b32_e32 v45, v62 -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v57, v5 +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v47, v4 -; VI-NEXT: v_mov_b32_e32 v63, v3 -; VI-NEXT: v_mov_b32_e32 v53, v28 -; VI-NEXT: v_mov_b32_e32 v43, v27 -; VI-NEXT: v_mov_b32_e32 v55, v26 -; VI-NEXT: v_mov_b32_e32 v41, v24 -; VI-NEXT: v_mov_b32_e32 v54, v22 +; VI-NEXT: v_mov_b32_e32 v54, v3 +; VI-NEXT: v_mov_b32_e32 v52, v28 +; VI-NEXT: 
v_mov_b32_e32 v63, v27 +; VI-NEXT: v_mov_b32_e32 v50, v26 +; VI-NEXT: v_mov_b32_e32 v34, v24 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_branch .LBB97_2 @@ -201134,18 +201267,18 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v29 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v5 ; GFX9-NEXT: v_lshlrev_b32_e32 v22, 8, v7 ; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v9 ; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v11 ; GFX9-NEXT: v_lshlrev_b32_e32 v20, 8, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v19 ; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v21 ; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v27 ; GFX9-NEXT: s_waitcnt vmcnt(24) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v43 ; GFX9-NEXT: s_waitcnt vmcnt(23) @@ -201174,10 +201307,10 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v52 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(23) ; 
GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v51 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v50 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill @@ -201189,7 +201322,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v39 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v30 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill @@ -201237,7 +201370,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill @@ -201264,23 +201397,23 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt 
vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v15 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 @@ -201293,48 +201426,49 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v3 ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:44 ; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:52 ; 
GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68 ; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:76 ; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:92 ; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:100 ; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:116 ; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:124 ; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:132 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:140 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:140 ; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:148 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:156 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:164 -; GFX9-NEXT: s_waitcnt vmcnt(21) -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:172 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:180 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:196 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort 
v37, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:204 ; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:212 -; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:220 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:228 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:220 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:228 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:236 ; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:244 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 -; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:252 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:260 ; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:268 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:276 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:276 ; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:284 ; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:292 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:300 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:316 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:300 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:308 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:316 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:324 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill @@ -201345,55 +201479,54 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; 
GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(30) -; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(33) ; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 
offset:696 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(36) -; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(39) +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(41) -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(41) -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(40) +; 
GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(40) +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(41) +; GFX9-NEXT: s_waitcnt vmcnt(40) ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(41) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(41) +; GFX9-NEXT: s_waitcnt vmcnt(40) +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(40) ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill ; GFX9-NEXT: 
buffer_store_dword v50, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill @@ -201403,7 +201536,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB97_2 @@ -201416,7 +201549,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_or_b32_sdwa v2, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v6, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -201453,10 +201586,10 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload @@ -201472,13 +201605,13 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -201486,7 +201619,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_dword v0, off, 
s[0:3], s32 offset:416 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload @@ -201527,8 +201660,8 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v52, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_mov_b32_e32 v50, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -201546,16 +201679,16 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_mov_b32_e32 v48, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v17, v17, 16, v1 ; 
GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_mov_b32_e32 v33, v45 +; GFX9-NEXT: v_mov_b32_e32 v33, v46 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v18, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload @@ -201568,7 +201701,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -201577,7 +201710,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -201585,121 +201718,122 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v22, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v34, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v23, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 
offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_mov_b32_e32 v46, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v1, v35, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v35, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v24, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v35, v45 -; GFX9-NEXT: v_mov_b32_e32 v45, v61 -; GFX9-NEXT: v_mov_b32_e32 v61, v42 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v1, v51, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_mov_b32_e32 v38, v0 -; GFX9-NEXT: v_or_b32_sdwa 
v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v37, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshl_or_b32 v25, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v54, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v54, v2 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v41, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v26, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v27, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v41, v57 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v29, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v54, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v26, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v45, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], 
s32 offset:552 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshl_or_b32 v27, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v60, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v60, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v1, v57, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v29, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v28, v1, 16, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v59, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v63, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v57, v59 ; GFX9-NEXT: v_lshl_or_b32 v29, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v30, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v56, v42 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v31, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_branch .LBB97_3 ; GFX9-NEXT: .LBB97_2: ; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: 
buffer_load_dword v39, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v33, v45 -; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v33, v46 +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v56, v61 +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 
offset:556 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 s[4:5], -1 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: .LBB97_3: ; %Flow ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] @@ -201902,7 +202036,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -201962,11 +202096,11 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; 
GFX9-NEXT: v_or_b32_sdwa v48, v46, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v48, v40, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v49, v35, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v49, v46, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 @@ -202001,7 +202135,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v53, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 -; GFX9-NEXT: v_add_u32_e32 v26, 3, v61 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v62 ; GFX9-NEXT: v_or_b32_sdwa v24, v54, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v36, 0x300, v24 ; GFX9-NEXT: v_add_u32_e32 v24, 0x300, v48 @@ -202010,7 +202144,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v54, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v26, 3, v45 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v61 ; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 ; GFX9-NEXT: v_or_b32_sdwa v20, v57, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v34, 0x300, v20 @@ -202019,7 +202153,7 @@ 
define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v26, 3, v56 ; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 -; GFX9-NEXT: v_or_b32_sdwa v21, v32, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v21, v45, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v28, 0x300, v21 ; GFX9-NEXT: v_add_u32_e32 v21, 0x300, v54 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 @@ -205867,1192 +206001,1031 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:216 ; 
4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v29 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v28 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v26 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v25 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v24 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v30 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v16 +; VI-NEXT: buffer_store_dword v34, off, 
s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v23 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v30 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v24 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:76 ; 4-byte 
Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v18 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v37 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v36 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; 
implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr55 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr62 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr42 -; VI-NEXT: ; implicit-def: $vgpr61 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: 
killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v31 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr60 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; 
implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; kill: killed $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; kill: killed $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; kill: killed $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; kill: killed $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; kill: killed $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; kill: killed $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; kill: killed $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; kill: killed $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; kill: killed $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; kill: killed $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; kill: 
killed $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; kill: killed $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; kill: killed $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; kill: killed $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; kill: killed $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: 
$vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr46 ; VI-NEXT: ; implicit-def: $vgpr45 ; VI-NEXT: ; implicit-def: $vgpr44 ; VI-NEXT: ; implicit-def: $vgpr41 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 
offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB98_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v16 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v16 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v15 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v14 -; 
VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v14 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v13 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v12 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v12 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v11 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v10 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v10 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v9 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v8 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v8 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v7 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v6 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v6 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[15:16] -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded 
Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[13:14] -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[11:12] -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[9:10] -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v31, v7 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v9, v10 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v9, v11 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v9, v12 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v9, v13 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v9, v14 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v9, v16 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v9, v8 -; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[7:8] -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded 
Spill -; VI-NEXT: v_mov_b32_e32 v7, v5 -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v7, v6 -; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[5:6] -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v5, 24, v4 -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v3 -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v5, v3 -; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] -; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v2 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v3, 8, v2 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[1:2] -; VI-NEXT: v_lshrrev_b32_e32 v1, 24, v37 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v37 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v36 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, v36 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, v37 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; VI-NEXT: 
v_mov_b32_e32 v3, v2 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[36:37] -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v5, v4 -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 24, v30 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v30 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v29 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, v29 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, v30 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[29:30] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 24, v28 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v28 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v27 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, v27 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, v28 -; VI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[27:28] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v26 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v25 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, v25 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, v26 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[25:26] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v24 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v23 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, v23 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, v24 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v22 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v21 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, v21 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, v22 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: 
v_mov_b32_e32 v1, v19 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, v20 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, v17 -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[19:20] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, v18 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24] -; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[21:22] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, v46 -; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[17:18] -; VI-NEXT: v_mov_b32_e32 v32, v15 -; VI-NEXT: v_lshrrev_b32_e32 v35, 24, v26 -; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v24 -; VI-NEXT: v_lshrrev_b32_e32 v58, 24, v22 -; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v20 -; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v20 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v19 -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v18 -; VI-NEXT: v_lshrrev_b32_e32 v62, 8, v18 -; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v17 -; VI-NEXT: v_mov_b32_e32 v46, v1 -; VI-NEXT: ; implicit-def: $vgpr1 -; VI-NEXT: ; implicit-def: $vgpr3 -; VI-NEXT: ; implicit-def: $vgpr5 -; VI-NEXT: ; implicit-def: $vgpr7 -; VI-NEXT: ; implicit-def: $vgpr9 -; VI-NEXT: ; implicit-def: $vgpr11 -; VI-NEXT: ; implicit-def: $vgpr13 -; VI-NEXT: ; implicit-def: $vgpr15 -; VI-NEXT: ; implicit-def: $vgpr17 -; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr21 -; VI-NEXT: ; implicit-def: $vgpr23 -; VI-NEXT: ; implicit-def: $vgpr25 -; VI-NEXT: ; implicit-def: $vgpr27 -; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: 
v_lshrrev_b32_e32 v33, 8, v15 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v11 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v10 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v10 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v9 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v8 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v8 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v7 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v6 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v6 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v5 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], 
s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v4 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v4 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v3 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v2 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v1 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v32 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v32 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v31 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v30 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v30 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v29 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v27 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[15:16] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[11:12] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[9:10] 
+; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[3:4] +; VI-NEXT: v_mov_b32_e32 v34, v47 +; VI-NEXT: v_lshrrev_b64 v[46:47], 24, v[1:2] +; VI-NEXT: v_mov_b32_e32 v47, v34 +; VI-NEXT: v_lshrrev_b64 v[34:35], 24, v[31:32] +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v34, v36 +; VI-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] +; VI-NEXT: v_lshrrev_b64 v[38:39], 24, v[25:26] +; VI-NEXT: v_mov_b32_e32 v36, v34 +; VI-NEXT: v_mov_b32_e32 v34, v50 +; VI-NEXT: v_lshrrev_b64 v[49:50], 24, v[23:24] +; VI-NEXT: v_mov_b32_e32 v39, v62 +; VI-NEXT: v_lshrrev_b64 v[61:62], 24, v[21:22] +; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[7:8] +; VI-NEXT: v_mov_b32_e32 v50, v34 +; VI-NEXT: v_mov_b32_e32 v62, v39 +; VI-NEXT: v_mov_b32_e32 v34, v40 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[19:20] +; VI-NEXT: v_mov_b32_e32 v40, v34 +; VI-NEXT: v_mov_b32_e32 v34, v43 +; VI-NEXT: v_lshrrev_b64 v[42:43], 24, v[17:18] +; VI-NEXT: v_lshrrev_b32_e32 v60, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v48, 24, v20 +; VI-NEXT: v_lshrrev_b32_e32 v55, 24, v18 +; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v18 +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v17 +; VI-NEXT: v_mov_b32_e32 v43, v34 ; VI-NEXT: .LBB98_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB98_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v31, 3 -; VI-NEXT: v_add_u16_sdwa v51, v18, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v32, 3, v18 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v51 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v18, v32, v18 -; 
VI-NEXT: v_add_u16_e32 v32, 3, v17 -; VI-NEXT: v_add_u16_sdwa v17, v17, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, 3 +; VI-NEXT: v_add_u16_e32 v34, 3, v18 +; VI-NEXT: v_add_u16_sdwa v18, v18, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; VI-NEXT: v_or_b32_e32 v35, v34, v18 +; VI-NEXT: v_add_u16_e32 v18, 3, v17 +; VI-NEXT: v_add_u16_sdwa v17, v17, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v17, v32, v17 -; VI-NEXT: v_add_u16_e32 v32, 3, v20 -; VI-NEXT: v_add_u16_sdwa v20, v20, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v20, v32, v20 -; VI-NEXT: v_add_u16_e32 v32, 3, v19 -; VI-NEXT: v_add_u16_sdwa v19, v19, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_add_u16_sdwa v48, v22, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v19, v32, v19 -; VI-NEXT: v_add_u16_e32 v32, 3, v22 -; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v48 -; VI-NEXT: v_add_u16_sdwa v53, 
v21, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v22, v32, v22 -; VI-NEXT: v_add_u16_e32 v32, 3, v21 -; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v53 -; VI-NEXT: v_add_u16_sdwa v61, v24, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v21, v32, v21 -; VI-NEXT: v_add_u16_e32 v32, 3, v24 -; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v61 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v24, v32, v24 -; VI-NEXT: v_add_u16_e32 v32, 3, v23 -; VI-NEXT: v_add_u16_sdwa v23, v23, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_add_u16_sdwa v58, v26, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v23, v32, v23 -; VI-NEXT: v_add_u16_e32 v32, 3, v26 -; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v58 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v26, v32, v26 -; VI-NEXT: v_add_u16_e32 v32, 3, v25 -; VI-NEXT: v_add_u16_sdwa v25, v25, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_add_u16_sdwa v39, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v25, v32, v25 -; VI-NEXT: v_add_u16_e32 v32, 3, v28 -; VI-NEXT: 
v_lshlrev_b32_e32 v28, 16, v39 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v28, v32, v28 -; VI-NEXT: v_add_u16_e32 v32, 3, v27 -; VI-NEXT: v_add_u16_sdwa v27, v27, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_add_u16_sdwa v35, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v27, v32, v27 -; VI-NEXT: v_add_u16_e32 v33, 3, v30 -; VI-NEXT: v_add_u16_e32 v34, 3, v29 -; VI-NEXT: v_add_u16_sdwa v32, v29, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v35 -; VI-NEXT: v_add_u16_sdwa v52, v37, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v30, v33, v29 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v32 -; VI-NEXT: v_add_u16_e32 v33, 3, v37 -; VI-NEXT: v_add_u16_sdwa v50, v36, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v52 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v29, v34, v29 -; VI-NEXT: v_add_u16_e32 v34, 3, v36 -; VI-NEXT: v_or_b32_e32 v37, v33, v32 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v50 -; VI-NEXT: v_add_u16_sdwa v57, v2, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v36, 
v34, v32 -; VI-NEXT: v_add_u16_e32 v33, 3, v2 -; VI-NEXT: v_add_u16_e32 v34, 3, v1 -; VI-NEXT: v_add_u16_sdwa v32, v1, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 -; VI-NEXT: v_or_b32_e32 v2, v33, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 -; VI-NEXT: v_add_u16_sdwa v56, v4, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v1, v34, v1 -; VI-NEXT: v_add_u16_e32 v33, 3, v4 -; VI-NEXT: v_add_u16_e32 v34, 3, v3 -; VI-NEXT: v_add_u16_sdwa v32, v3, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v56 -; VI-NEXT: v_or_b32_e32 v4, v33, v3 -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 -; VI-NEXT: v_add_u16_sdwa v47, v6, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v3, v34, v3 -; VI-NEXT: v_add_u16_e32 v33, 3, v6 -; VI-NEXT: v_add_u16_e32 v34, 3, v5 -; VI-NEXT: v_add_u16_sdwa v32, v5, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 -; VI-NEXT: v_or_b32_e32 v6, v33, v5 -; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v32 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v5, v34, v5 -; VI-NEXT: v_add_u16_sdwa v34, v8, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v33, off, s[0:3], 
s32 offset:392 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; VI-NEXT: v_add_u16_e32 v38, 3, v8 -; VI-NEXT: v_add_u16_e32 v33, 3, v7 -; VI-NEXT: v_add_u16_sdwa v32, v7, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 -; VI-NEXT: v_or_b32_e32 v8, v38, v7 -; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 -; VI-NEXT: v_add_u16_sdwa v59, v10, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v7, v33, v7 -; VI-NEXT: v_add_u16_e32 v33, 3, v10 -; VI-NEXT: v_add_u16_e32 v38, 3, v9 -; VI-NEXT: v_add_u16_sdwa v32, v9, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v59 -; VI-NEXT: v_or_b32_e32 v10, v33, v9 -; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v32 -; VI-NEXT: v_add_u16_sdwa v63, v12, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v9, v38, v9 -; VI-NEXT: v_add_u16_e32 v33, 3, v12 -; VI-NEXT: v_add_u16_e32 v38, 3, v11 -; VI-NEXT: v_add_u16_sdwa v32, v11, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v63 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v12, v33, v11 -; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 -; VI-NEXT: v_add_u16_sdwa v33, v14, v31 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v11, v38, v11 -; VI-NEXT: v_add_u16_e32 v38, 3, v14 -; VI-NEXT: v_add_u16_e32 v49, 3, v13 -; VI-NEXT: v_add_u16_sdwa v32, v13, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v33 -; VI-NEXT: v_add_u16_sdwa v60, v16, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v14, v38, v13 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 -; VI-NEXT: v_add_u16_sdwa v31, v15, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v34, v18, v17 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: v_add_u16_sdwa v17, v20, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v34, 3, v20 +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_or_b32_e32 v35, v34, v17 +; VI-NEXT: v_add_u16_sdwa v17, v19, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v20, 3, v19 +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v34, v20, v17 +; VI-NEXT: v_add_u16_sdwa v17, v22, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; 
VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: v_add_u16_e32 v19, 3, v22 +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_or_b32_e32 v61, v19, v17 +; VI-NEXT: v_add_u16_sdwa v17, v21, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v22, 3, v21 +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_or_b32_e32 v60, v22, v17 +; VI-NEXT: v_add_u16_sdwa v17, v24, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: v_add_u16_e32 v19, 3, v24 +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_or_b32_e32 v63, v19, v17 +; VI-NEXT: v_add_u16_sdwa v17, v23, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v24, 3, v23 +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_or_b32_e32 v62, v24, v17 +; VI-NEXT: v_add_u16_sdwa v17, v26, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: v_add_u16_e32 v19, 3, v26 +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_or_b32_e32 v58, v19, v17 +; VI-NEXT: v_add_u16_sdwa v17, v25, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v26, 3, v25 +; VI-NEXT: buffer_store_dword v17, off, s[0:3], 
s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_or_b32_e32 v57, v26, v17 +; VI-NEXT: v_add_u16_sdwa v17, v28, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: v_add_u16_e32 v19, 3, v28 +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_or_b32_e32 v56, v19, v17 +; VI-NEXT: v_add_u16_sdwa v17, v27, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v28, 3, v27 +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_or_b32_e32 v55, v28, v17 +; VI-NEXT: v_add_u16_sdwa v17, v30, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: v_add_u16_e32 v21, 3, v30 +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: v_add_u16_sdwa v19, v29, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_add_u16_e32 v30, 3, v29 +; VI-NEXT: v_or_b32_e32 v40, v21, v17 +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v19 +; VI-NEXT: v_or_b32_e32 v39, v30, v17 +; VI-NEXT: v_add_u16_sdwa v17, v32, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: v_add_u16_e32 v45, 3, v32 +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_add_u16_sdwa v19, v31, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:196 ; 
4-byte Folded Spill +; VI-NEXT: v_add_u16_e32 v32, 3, v31 +; VI-NEXT: v_or_b32_e32 v38, v45, v17 +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v19 +; VI-NEXT: v_add_u16_sdwa v21, v2, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v37, v32, v17 +; VI-NEXT: v_add_u16_e32 v17, 3, v2 +; VI-NEXT: v_add_u16_e32 v2, 3, v1 +; VI-NEXT: v_add_u16_sdwa v19, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 +; VI-NEXT: v_or_b32_e32 v49, v17, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; VI-NEXT: v_or_b32_e32 v48, v2, v1 +; VI-NEXT: v_add_u16_sdwa v1, v4, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: v_add_u16_e32 v19, 3, v4 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: v_add_u16_e32 v4, 3, v3 +; VI-NEXT: v_add_u16_sdwa v3, v3, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_or_b32_e32 v47, v19, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; VI-NEXT: v_or_b32_e32 v46, v4, v1 +; VI-NEXT: v_add_u16_sdwa v1, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: v_add_u16_e32 v21, 3, v6 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: v_add_u16_sdwa v3, v5, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_u16_e32 v6, 3, v5 +; VI-NEXT: v_or_b32_e32 v44, v21, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; VI-NEXT: 
v_or_b32_e32 v43, v6, v1 +; VI-NEXT: v_add_u16_sdwa v1, v8, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: v_add_u16_e32 v23, 3, v8 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: v_add_u16_sdwa v3, v7, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_u16_e32 v8, 3, v7 +; VI-NEXT: v_or_b32_e32 v42, v23, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; VI-NEXT: v_or_b32_e32 v41, v8, v1 +; VI-NEXT: v_add_u16_sdwa v1, v10, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_add_u16_e32 v25, 3, v10 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: v_add_u16_sdwa v3, v9, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_u16_e32 v10, 3, v9 +; VI-NEXT: v_or_b32_e32 v54, v25, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; VI-NEXT: v_add_u16_sdwa v52, v12, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v53, v10, v1 +; VI-NEXT: v_add_u16_e32 v27, 3, v12 +; VI-NEXT: v_add_u16_sdwa v3, v11, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 +; VI-NEXT: v_add_u16_e32 v12, 3, v11 +; VI-NEXT: v_or_b32_e32 v51, v27, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; VI-NEXT: v_or_b32_e32 v50, v12, v1 +; VI-NEXT: v_add_u16_sdwa v1, v14, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: 
v_add_u16_e32 v29, 3, v14 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: v_add_u16_sdwa v3, v13, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_u16_e32 v14, 3, v13 +; VI-NEXT: v_or_b32_e32 v36, v29, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; VI-NEXT: v_add_u16_sdwa v59, v16, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v35, v14, v1 +; VI-NEXT: v_add_u16_sdwa v3, v15, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v16, 3, v16 -; VI-NEXT: v_add_u16_e32 v32, 3, v15 -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v60 -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v16, v16, v15 -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v31 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v15, v32, v15 -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v16 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v15 -; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[15:16] -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v13, v49, v13 -; VI-NEXT: v_lshrrev_b32_e32 v15, 8, v14 -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 
offset:236 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v15, 8, v13 -; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[13:14] -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v11 -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[11:12] -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v10 -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v9 -; VI-NEXT: v_lshrrev_b64 v[9:10], 24, v[9:10] -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v7 -; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[7:8] -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v6 -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v5 -; VI-NEXT: v_lshrrev_b64 v[5:6], 24, 
v[5:6] -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v3 -; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] -; VI-NEXT: v_lshrrev_b32_e32 v3, 8, v2 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[1:2] -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v37 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v36 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[36:37] -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v30 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v29 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[29:30] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v28 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v27 -; VI-NEXT: buffer_store_dword v1, 
off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[27:28] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v26 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v25 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[25:26] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v24 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v23 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v22 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v21 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v1, v60, 8, 8 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v1, v33, 8, 8 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v1, v63, 8, 8 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v1, v59, 8, 8 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v1, v34, 8, 8 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v1, v47, 8, 8 -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[19:20] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 
4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v1, v56, 8, 8 -; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[17:18] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v1, v57, 8, 8 -; VI-NEXT: v_mov_b32_e32 v46, v35 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v1, v52, 8, 8 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v1, v46, 8, 8 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v1, v39, 8, 8 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v49, v53 -; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v52, v51 -; VI-NEXT: v_bfe_u32 v31, v51, 8, 8 -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24] -; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[21:22] -; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v20 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v19 -; VI-NEXT: v_lshrrev_b32_e32 v62, 8, v18 -; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v17 -; VI-NEXT: v_bfe_u32 v35, v58, 8, 8 -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v39, v61, 8, 8 -; VI-NEXT: v_bfe_u32 v58, v48, 8, 8 -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; VI-NEXT: v_add_u16_e32 v15, 3, v15 +; VI-NEXT: v_or_b32_e32 v34, v16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; VI-NEXT: v_or_b32_e32 v33, v15, v1 +; VI-NEXT: v_mov_b32_e32 v31, v32 +; VI-NEXT: v_mov_b32_e32 v32, v45 +; VI-NEXT: 
v_lshrrev_b32_e32 v45, 8, v34 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, v17 +; VI-NEXT: v_mov_b32_e32 v3, v4 +; VI-NEXT: v_mov_b32_e32 v4, v19 +; VI-NEXT: v_mov_b32_e32 v5, v6 +; VI-NEXT: v_mov_b32_e32 v6, v21 +; VI-NEXT: v_mov_b32_e32 v7, v8 +; VI-NEXT: v_mov_b32_e32 v8, v23 +; VI-NEXT: v_mov_b32_e32 v9, v10 +; VI-NEXT: v_mov_b32_e32 v10, v25 +; VI-NEXT: v_mov_b32_e32 v11, v12 +; VI-NEXT: v_mov_b32_e32 v12, v27 +; VI-NEXT: v_mov_b32_e32 v13, v14 +; VI-NEXT: v_mov_b32_e32 v14, v29 +; VI-NEXT: v_mov_b32_e32 v17, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v19, v20 +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v21, v22 +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v23, v24 +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v25, v26 +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v27, v28 +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v29, v30 +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v33 +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[33:34] +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v36 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill 
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v35 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[35:36] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v51 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v50 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[50:51] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v54 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v53 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[53:54] +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v42 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v41 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v44 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v43 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v47 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v46 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v49 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 
offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v48 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v38 +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[43:44] +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v37 +; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[46:47] +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[34:35], 24, v[37:38] +; VI-NEXT: v_lshrrev_b64 v[46:47], 24, v[48:49] +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v40 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v39 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[34:35], 24, v[39:40] +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v56 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v55 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v58 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v57 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v63 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: 
v_lshrrev_b32_e32 v34, 8, v62 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v61 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v60 +; VI-NEXT: v_lshrrev_b64 v[35:36], 24, v[55:56] +; VI-NEXT: v_lshrrev_b64 v[49:50], 24, v[62:63] +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[61:62], 24, v[60:61] +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b64 v[38:39], 24, v[57:58] +; VI-NEXT: v_mov_b32_e32 v57, v52 +; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[41:42] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v55 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v54 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[54:55] +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v34, v59, 8, 8 +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: 
buffer_store_dword v34, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v54 ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_bfe_u32 v61, v53, 8, 8 +; VI-NEXT: v_lshrrev_b64 v[42:43], 24, v[54:55] +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v55 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_bfe_u32 v34, v59, 8, 8 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v34, v52, 8, 8 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v34, v62, 8, 8 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v52, v34 +; VI-NEXT: v_bfe_u32 v34, v34, 8, 8 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v54, v34 +; VI-NEXT: v_bfe_u32 v34, v34, 8, 8 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v51, v34 +; VI-NEXT: v_bfe_u32 v34, v34, 8, 8 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v56, v34 +; VI-NEXT: v_bfe_u32 v60, v34, 8, 8 +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_bfe_u32 v34, v34, 8, 8 +; VI-NEXT: buffer_store_dword v34, off, 
s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v34, v36, 8, 8 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v34, v50, 8, 8 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v34, v53, 8, 8 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v34, v40, 8, 8 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_bfe_u32 v34, v34, 8, 8 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_bfe_u32 v48, v34, 8, 8 +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v63, v34 +; VI-NEXT: v_bfe_u32 v55, v34, 8, 8 ; VI-NEXT: .LBB98_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v43 +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD -; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v34 +; VI-NEXT: v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v34 +; VI-NEXT: v_or_b32_sdwa v2, v2, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v46 +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v34, v46, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v60 +; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 
4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v40 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v45 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; 
VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v44 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: 
v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v41 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; VI-NEXT: 
buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v33 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v62, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: 
v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 ; VI-NEXT: buffer_store_dword v1, v2, 
s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 
offset:200 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 60, v0 ; VI-NEXT: buffer_store_dword v1, v2, 
s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v55 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v45 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v42 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v58 +; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 64, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v62 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v31 -; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v37 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v55 +; VI-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v38 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v39 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v44 -; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v42 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v48 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: 
v_lshlrev_b16_e32 v2, 8, v61 -; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v61 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v41 -; VI-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded 
Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v58 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v49 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v54 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 
offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v39 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v38 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; VI-NEXT: 
buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v35 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, 
off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v35 +; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; 
VI-NEXT: v_or_b32_sdwa v2, v43, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: 
buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 
8, v2 +; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen @@ -209409,8 +209382,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_readfirstlane_b32 s82, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v20 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(14) @@ -209441,9 +209414,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v43 ; SI-NEXT: v_writelane_b32 v62, s6, 0 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB99_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff @@ -209455,7 +209426,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v1, s56 ; SI-NEXT: s_and_b32 s4, s20, 0xffff ; SI-NEXT: s_lshl_b32 s5, s21, 16 -; SI-NEXT: v_alignbit_b32 v8, s57, v1, 24 +; SI-NEXT: v_alignbit_b32 v5, s57, v1, 24 ; SI-NEXT: v_alignbit_b32 v50, s57, v1, 16 ; SI-NEXT: v_alignbit_b32 v1, s57, v1, 8 ; SI-NEXT: s_or_b32 s46, s4, s5 @@ -209467,39 
+209438,43 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v1, s46 ; SI-NEXT: s_and_b32 s4, s24, 0xffff ; SI-NEXT: s_lshl_b32 s5, s25, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, s47, v1, 24 +; SI-NEXT: v_alignbit_b32 v5, s47, v1, 24 ; SI-NEXT: s_or_b32 s44, s4, s5 ; SI-NEXT: s_and_b32 s4, s26, 0xffff ; SI-NEXT: s_lshl_b32 s5, s27, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, s47, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, s47, v1, 16 ; SI-NEXT: v_alignbit_b32 v51, s47, v1, 8 ; SI-NEXT: s_or_b32 s45, s4, s5 ; SI-NEXT: v_mov_b32_e32 v1, s44 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v5, s45, v1, 24 ; SI-NEXT: s_and_b32 s4, s28, 0xffff ; SI-NEXT: s_lshl_b32 s5, s29, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, s45, v1, 24 +; SI-NEXT: v_alignbit_b32 v5, s45, v1, 16 +; SI-NEXT: v_alignbit_b32 v1, s45, v1, 8 ; SI-NEXT: s_or_b32 s42, s4, s5 ; SI-NEXT: s_and_b32 s4, s82, 0xffff ; SI-NEXT: s_lshl_b32 s5, s81, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, s45, v1, 16 -; SI-NEXT: v_alignbit_b32 v49, s45, v1, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_or_b32 s43, s4, s5 +; SI-NEXT: s_waitcnt expcnt(0) ; 
SI-NEXT: v_mov_b32_e32 v1, s42 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, s43, v1, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v5, s43, v1, 24 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v5, s43, v1, 16 +; SI-NEXT: v_alignbit_b32 v1, s43, v1, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, s43, v1, 16 -; SI-NEXT: v_alignbit_b32 v48, s43, v1, 8 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 ; SI-NEXT: s_and_b32 s4, s85, 0xffff ; SI-NEXT: s_lshl_b32 s5, s84, 16 @@ -209549,7 +209524,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_or_b32 s10, s4, s5 ; SI-NEXT: s_and_b32 s4, s75, 0xffff ; SI-NEXT: s_lshl_b32 s5, s74, 16 -; SI-NEXT: v_or_b32_e32 v12, v1, v5 +; SI-NEXT: v_or_b32_e32 v12, v1, v6 ; SI-NEXT: s_or_b32 s9, s4, s5 ; SI-NEXT: s_and_b32 s4, s77, 0xffff ; SI-NEXT: s_lshl_b32 s5, s76, 16 @@ -209573,7 +209548,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 ; SI-NEXT: v_writelane_b32 v62, s4, 1 ; SI-NEXT: s_lshr_b32 s4, s10, 8 -; SI-NEXT: v_or_b32_e32 v10, v1, v6 +; SI-NEXT: v_or_b32_e32 v10, v1, v8 ; SI-NEXT: v_writelane_b32 v62, s4, 3 ; SI-NEXT: s_lshr_b32 s4, s9, 8 ; SI-NEXT: v_alignbit_b32 v1, s14, v10, 24 @@ -209594,32 +209569,34 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 ; SI-NEXT: v_writelane_b32 v62, s4, 15 ; SI-NEXT: s_and_b32 s4, s72, 0xffff -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 
; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v28, v8 ; SI-NEXT: v_or_b32_e32 v8, v1, v9 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 ; SI-NEXT: v_writelane_b32 v62, s4, 2 ; SI-NEXT: s_and_b32 s4, s74, 0xffff +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v5, v1, v13 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 ; SI-NEXT: v_writelane_b32 v62, s4, 5 ; SI-NEXT: s_and_b32 s4, s76, 0xffff -; SI-NEXT: v_mov_b32_e32 v28, v13 +; SI-NEXT: v_mov_b32_e32 v30, v13 ; SI-NEXT: v_or_b32_e32 v13, v1, v17 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31 ; SI-NEXT: v_writelane_b32 v62, s4, 8 ; SI-NEXT: s_and_b32 s4, s78, 0xffff -; SI-NEXT: v_mov_b32_e32 v26, v9 +; SI-NEXT: v_mov_b32_e32 v29, v9 ; SI-NEXT: v_or_b32_e32 v9, v1, v18 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 ; SI-NEXT: v_writelane_b32 v62, s4, 11 ; SI-NEXT: s_and_b32 s4, s88, 0xffff -; SI-NEXT: v_mov_b32_e32 v25, v6 +; SI-NEXT: v_mov_b32_e32 v26, v6 ; SI-NEXT: v_or_b32_e32 v6, v1, v20 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v33 ; SI-NEXT: v_writelane_b32 v62, s4, 14 ; SI-NEXT: s_bfe_u32 s4, s74, 0x80008 +; SI-NEXT: v_mov_b32_e32 v25, v4 ; SI-NEXT: v_or_b32_e32 v4, v1, v21 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v34 ; SI-NEXT: v_writelane_b32 v62, s4, 4 @@ -209631,12 +209608,12 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v1, v1, v24 ; SI-NEXT: v_writelane_b32 v62, s4, 10 ; SI-NEXT: s_bfe_u32 s4, s88, 0x80008 -; SI-NEXT: v_mov_b32_e32 v29, v17 -; SI-NEXT: v_mov_b32_e32 v30, v18 -; SI-NEXT: v_mov_b32_e32 v36, v20 -; SI-NEXT: v_mov_b32_e32 v37, v21 -; SI-NEXT: v_mov_b32_e32 v38, v22 -; SI-NEXT: v_mov_b32_e32 v39, v24 +; SI-NEXT: v_mov_b32_e32 v36, v17 +; SI-NEXT: v_mov_b32_e32 v37, v18 +; SI-NEXT: v_mov_b32_e32 v38, v20 +; SI-NEXT: v_mov_b32_e32 v39, v21 +; SI-NEXT: v_mov_b32_e32 v48, v22 +; SI-NEXT: v_mov_b32_e32 v49, v24 ; 
SI-NEXT: s_lshr_b32 s68, s57, 8 ; SI-NEXT: s_lshr_b32 s65, s47, 8 ; SI-NEXT: s_lshr_b32 s54, s45, 8 @@ -209697,9 +209674,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_alignbit_b32 v54, s6, v1, 8 ; SI-NEXT: s_cbranch_execnz .LBB99_3 ; SI-NEXT: .LBB99_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_and_b32 s4, s18, 0xffff ; SI-NEXT: s_lshl_b32 s5, s88, 16 @@ -209811,50 +209786,54 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v22, s47, v21, 24 -; SI-NEXT: s_lshr_b32 s4, s11, 8 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v22, s47, v21, 16 ; SI-NEXT: v_alignbit_b32 v51, s47, v21, 8 ; SI-NEXT: v_mov_b32_e32 v21, s44 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, s45, v21, 24 +; SI-NEXT: s_lshr_b32 s4, s11, 8 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, s45, v21, 16 +; SI-NEXT: v_alignbit_b32 v21, s45, v21, 8 ; SI-NEXT: v_writelane_b32 v62, s4, 1 ; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v32 ; SI-NEXT: v_and_b32_e32 v3, 
0xffff, v3 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, s45, v21, 24 +; SI-NEXT: v_mov_b32_e32 v21, s42 ; SI-NEXT: v_writelane_b32 v62, s4, 2 ; SI-NEXT: s_lshr_b32 s4, s10, 8 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_or_b32_e32 v3, v16, v3 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, s45, v21, 16 -; SI-NEXT: v_alignbit_b32 v49, s45, v21, 8 -; SI-NEXT: v_mov_b32_e32 v21, s42 +; SI-NEXT: v_alignbit_b32 v22, s43, v21, 24 ; SI-NEXT: v_writelane_b32 v62, s4, 3 ; SI-NEXT: s_lshr_b32 s4, s9, 24 -; SI-NEXT: v_or_b32_e32 v5, v36, v5 +; SI-NEXT: v_or_b32_e32 v5, v38, v5 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v3 ; SI-NEXT: v_mov_b32_e32 v3, s41 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, s43, v21, 24 +; SI-NEXT: v_alignbit_b32 v22, s43, v21, 16 +; SI-NEXT: v_alignbit_b32 v21, s43, v21, 8 ; SI-NEXT: v_writelane_b32 v62, s4, 4 ; SI-NEXT: s_lshr_b32 s4, s9, 16 ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v5 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v31 -; SI-NEXT: v_or_b32_e32 v7, v14, v7 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v7, v25, v7 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, s43, v21, 16 -; 
SI-NEXT: v_alignbit_b32 v48, s43, v21, 8 ; SI-NEXT: v_alignbit_b32 v21, v3, v16, 24 ; SI-NEXT: v_writelane_b32 v62, s4, 5 ; SI-NEXT: s_lshr_b32 s4, s9, 8 @@ -209868,7 +209847,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_alignbit_b32 v3, v3, v16, 8 ; SI-NEXT: v_writelane_b32 v62, s4, 6 ; SI-NEXT: s_lshr_b32 s4, s8, 24 -; SI-NEXT: v_or_b32_e32 v5, v30, v5 +; SI-NEXT: v_or_b32_e32 v5, v37, v5 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -209877,7 +209856,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_lshr_b32 s4, s8, 16 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v5 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v27 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_or_b32_e32 v11, v26, v11 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v3, v7, v14, 16 @@ -209892,7 +209871,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_alignbit_b32 v3, v7, v14, 8 ; SI-NEXT: v_writelane_b32 v62, s4, 9 ; SI-NEXT: s_lshr_b32 s4, s7, 24 -; SI-NEXT: v_or_b32_e32 v5, v29, v5 +; SI-NEXT: v_or_b32_e32 v5, v36, v5 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -209905,7 +209884,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v5 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v23 ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v19 -; SI-NEXT: v_or_b32_e32 v10, v25, v10 +; SI-NEXT: v_or_b32_e32 v10, v28, v10 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v3, v11, v12, 16 @@ 
-209923,11 +209902,11 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_alignbit_b32 v3, v11, v12, 8 ; SI-NEXT: v_writelane_b32 v62, s4, 12 ; SI-NEXT: s_lshr_b32 s4, s6, 24 -; SI-NEXT: v_or_b32_e32 v1, v39, v1 -; SI-NEXT: v_or_b32_e32 v2, v38, v2 -; SI-NEXT: v_or_b32_e32 v4, v37, v4 -; SI-NEXT: v_or_b32_e32 v5, v28, v5 -; SI-NEXT: v_or_b32_e32 v8, v26, v8 +; SI-NEXT: v_or_b32_e32 v1, v49, v1 +; SI-NEXT: v_or_b32_e32 v2, v48, v2 +; SI-NEXT: v_or_b32_e32 v4, v39, v4 +; SI-NEXT: v_or_b32_e32 v5, v30, v5 +; SI-NEXT: v_or_b32_e32 v8, v29, v8 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v3, v15, v10, 24 @@ -210095,65 +210074,64 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: v_mov_b32_e32 v7, s4 ; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_and_b32 s4, s44, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v49 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v3, s4, v3 ; SI-NEXT: s_and_b32 s4, s45, 0xff -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s66, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s16, s5 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_lshl_b32 s5, s51, 8 -; SI-NEXT: s_lshl_b32 s16, s67, 24 -; SI-NEXT: v_readlane_b32 s67, v63, 19 -; SI-NEXT: v_readlane_b32 s66, v63, 18 -; SI-NEXT: v_readlane_b32 s51, v63, 11 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; 
SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s66, 0xff ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v7, v11, v7 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 ; SI-NEXT: v_or_b32_e32 v3, v3, v7 ; SI-NEXT: v_add_i32_e32 v7, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: v_mov_b32_e32 v7, s4 ; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_and_b32 s4, s42, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v48 +; SI-NEXT: s_lshl_b32 s5, s51, 8 +; SI-NEXT: s_lshl_b32 s16, s67, 24 +; SI-NEXT: v_readlane_b32 s67, v63, 19 +; SI-NEXT: v_readlane_b32 s66, v63, 18 +; SI-NEXT: v_readlane_b32 s51, v63, 11 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v3, s4, v3 ; SI-NEXT: s_and_b32 s4, s43, 0xff -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s55, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s16, s5 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_lshl_b32 s5, s48, 8 -; SI-NEXT: s_lshl_b32 s16, s64, 24 -; SI-NEXT: v_readlane_b32 s64, v63, 16 -; SI-NEXT: v_readlane_b32 s55, v63, 15 -; SI-NEXT: v_readlane_b32 s48, v63, 8 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s55, 0xff ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: 
s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v7, v11, v7 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 ; SI-NEXT: v_or_b32_e32 v3, v3, v7 ; SI-NEXT: v_add_i32_e32 v7, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 @@ -210163,16 +210141,21 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v3, 0xff, v16 ; SI-NEXT: s_and_b32 s4, s41, 0xff +; SI-NEXT: s_lshl_b32 s5, s48, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s52, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s64, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s16, s5 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_lshl_b32 s5, s37, 8 ; SI-NEXT: s_lshl_b32 s16, s53, 24 +; SI-NEXT: v_readlane_b32 s64, v63, 16 +; SI-NEXT: v_readlane_b32 s55, v63, 15 ; SI-NEXT: v_readlane_b32 s53, v63, 13 ; SI-NEXT: v_readlane_b32 s52, v63, 12 +; SI-NEXT: v_readlane_b32 s48, v63, 8 ; SI-NEXT: v_readlane_b32 s37, v63, 5 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) @@ -210532,55 +210515,62 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; kill: killed $sgpr6 ; SI-NEXT: ; implicit-def: $vcc_lo -; SI-NEXT: v_mov_b32_e32 v39, v24 +; SI-NEXT: v_mov_b32_e32 v49, v24 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: v_mov_b32_e32 v38, v22 +; SI-NEXT: v_mov_b32_e32 v48, v22 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; 
implicit-def: $sgpr6 ; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: v_mov_b32_e32 v37, v21 +; SI-NEXT: v_mov_b32_e32 v39, v21 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: v_mov_b32_e32 v36, v20 +; SI-NEXT: v_mov_b32_e32 v38, v20 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: v_mov_b32_e32 v30, v18 +; SI-NEXT: v_mov_b32_e32 v37, v18 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: v_mov_b32_e32 v29, v17 +; SI-NEXT: v_mov_b32_e32 v36, v17 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: v_mov_b32_e32 v28, v13 +; SI-NEXT: v_mov_b32_e32 v30, v13 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: v_mov_b32_e32 v26, v9 +; SI-NEXT: v_mov_b32_e32 v29, v9 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: v_mov_b32_e32 v25, v6 +; SI-NEXT: v_mov_b32_e32 v28, v8 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: v_mov_b32_e32 v26, v6 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: v_mov_b32_e32 v25, v4 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; kill: killed $vcc_lo ; SI-NEXT: ; implicit-def: $vcc_lo ; SI-NEXT: ; implicit-def: $sgpr56 @@ -210596,13 +210586,11 @@ define inreg 
<128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr69 ; SI-NEXT: ; implicit-def: $sgpr80 ; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $sgpr45 ; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; implicit-def: $sgpr66 ; SI-NEXT: ; implicit-def: $sgpr70 ; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $sgpr43 ; SI-NEXT: ; implicit-def: $sgpr51 ; SI-NEXT: ; implicit-def: $sgpr55 @@ -210651,12 +210639,12 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr9 ; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; kill: killed $vcc_lo -; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: ; kill: killed $vcc_lo +; SI-NEXT: ; implicit-def: $vcc_lo ; SI-NEXT: ; kill: killed $vcc_lo ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr24 @@ -210683,7 +210671,6 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 @@ -212280,7 +212267,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v36 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v13 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; 
GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v36 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v22 @@ -212288,7 +212275,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v35 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v22 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v38 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v22 @@ -212587,7 +212574,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: v_readlane_b32 s4, v62, 22 ; GFX9-NEXT: v_mov_b32_e32 v60, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 23 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v17, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 24 ; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill @@ -212595,7 +212582,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: v_readlane_b32 s4, v62, 25 ; GFX9-NEXT: v_mov_b32_e32 v23, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 26 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v17, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 27 ; GFX9-NEXT: v_mov_b32_e32 v59, s4 @@ -212865,14 +212852,14 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: 
buffer_load_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v35, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -217870,8 +217857,8 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; SI-NEXT: v_mul_f32_e32 v45, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v30 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -217895,403 +217882,431 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mul_f32_e32 v63, 1.0, v18 ; SI-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v21 +; 
SI-NEXT: v_mul_f32_e32 v44, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; SI-NEXT: v_mul_f32_e32 v46, 1.0, v22 ; SI-NEXT: v_mul_f32_e32 v47, 1.0, v23 ; SI-NEXT: v_mul_f32_e32 v56, 1.0, v24 ; SI-NEXT: v_mul_f32_e32 v57, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v28 ; SI-NEXT: v_mul_f32_e64 v15, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v16, 1.0, s18 ; SI-NEXT: v_mul_f32_e64 v17, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v18, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v21, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s21 ; SI-NEXT: v_mul_f32_e64 v22, 1.0, s22 ; SI-NEXT: v_mul_f32_e64 v23, 1.0, s23 ; SI-NEXT: v_mul_f32_e64 v24, 1.0, s24 ; SI-NEXT: v_mul_f32_e64 v25, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v26, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v27, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v30, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v27, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s28 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v30 ; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; SI-NEXT: v_mul_f32_e32 v33, 1.0, v33 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: v_mul_f32_e32 v34, 1.0, v34 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: v_mul_f32_e32 v35, 1.0, v35 -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v36 -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v37 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v38 +; SI-NEXT: s_waitcnt vmcnt(12) expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v50 +; SI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(11) expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(10) expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v52 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v39 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v48 -; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v55 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(6) expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v41 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v50 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v52 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v53 -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v54 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v40 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v42 +; SI-NEXT: s_waitcnt vmcnt(10) expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v37 +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v41 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v38 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v48 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v49 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v53 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v55 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v40 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v42 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v43 ; SI-NEXT: s_waitcnt expcnt(0) ; 
SI-NEXT: v_mul_f32_e64 v1, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v28, 1.0, s28 -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e64 v30, 1.0, s29 +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill 
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword 
v21, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 
offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB101_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_mov_b32_e32 v42, v51 -; SI-NEXT: v_mov_b32_e32 v55, v50 -; SI-NEXT: v_mov_b32_e32 v40, v52 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v55, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_mov_b32_e32 v20, v44 +; SI-NEXT: v_mov_b32_e32 v22, v21 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_mov_b32_e32 v24, v47 ; SI-NEXT: v_mov_b32_e32 v23, v46 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_mov_b32_e32 v25, v56 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_mov_b32_e32 v26, v57 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: 
v_mov_b32_e32 v27, v57 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_mov_b32_e32 v28, v26 +; SI-NEXT: v_mov_b32_e32 v29, v58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_mov_b32_e32 v30, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_lshrrev_b32_e32 
v6, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v36, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v38 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v51 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v33 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v39 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v59, v1 ; 
SI-NEXT: v_lshrrev_b32_e32 v1, 16, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v35 -; SI-NEXT: v_mov_b32_e32 v35, v43 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v61 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_mov_b32_e32 v38, v10 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v39 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v61, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_mov_b32_e32 v19, v28 -; SI-NEXT: v_mov_b32_e32 v28, v14 -; SI-NEXT: v_mov_b32_e32 v39, v22 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v19, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v1 +; SI-NEXT: v_mov_b32_e32 v44, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 
16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v46 +; SI-NEXT: v_mov_b32_e32 v46, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v63, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v47 -; SI-NEXT: v_mov_b32_e32 v47, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v49 -; SI-NEXT: v_mov_b32_e32 v49, v15 -; SI-NEXT: v_mov_b32_e32 v15, v41 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v33, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v50 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v52 -; SI-NEXT: v_mov_b32_e32 v51, v53 -; SI-NEXT: v_mov_b32_e32 v53, v54 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v54 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_mov_b32_e32 v20, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v33 +; SI-NEXT: v_mov_b32_e32 v33, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v47, v2 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v57 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v48, v10 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v37 -; SI-NEXT: v_mov_b32_e32 v37, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v56 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v2 -; SI-NEXT: v_mov_b32_e32 v27, v58 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v9 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v34, v13 -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_mov_b32_e32 v26, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v21, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v35 +; SI-NEXT: v_mov_b32_e32 v35, v55 +; SI-NEXT: v_mov_b32_e32 v55, v53 +; SI-NEXT: v_mov_b32_e32 v34, v43 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_mov_b32_e32 v32, v42 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v51 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v31 +; SI-NEXT: v_mov_b32_e32 v31, v36 +; SI-NEXT: v_mov_b32_e32 v36, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v37 +; SI-NEXT: v_mov_b32_e32 v37, v24 +; SI-NEXT: v_mov_b32_e32 v24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_mov_b32_e32 v38, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_mov_b32_e32 v40, v50 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(7) expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(6) expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v50 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v52 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v12 +; SI-NEXT: 
v_lshrrev_b32_e32 v12, 16, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v54 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v52 +; SI-NEXT: v_mov_b32_e32 v54, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v53 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 ; SI-NEXT: s_cbranch_execnz .LBB101_3 ; SI-NEXT: .LBB101_2: ; %cmp.true -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v53 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v52 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v40 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v50 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v42 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded 
Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v48 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v49 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v1, 
0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v48, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v36 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 @@ -218300,10 +218315,10 @@ 
define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v28 @@ -218312,335 +218327,329 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, 
s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; SI-NEXT: 
v_cvt_f32_f16_e32 v60, v60 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v38, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v43 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v14 -; SI-NEXT: buffer_load_dword v14, 
off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v37, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v33, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v50 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v36, v56 -; 
SI-NEXT: v_cvt_f32_f16_e32 v56, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v26, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v39 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v51 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v23 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v25 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v26 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v27 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v24 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v34, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt 
vmcnt(2) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v22 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v58 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v25 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v30, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 -; 
SI-NEXT: v_cvt_f32_f16_e32 v2, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v35, v55 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v39, v54 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v37, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v52 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword 
v14, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v14 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v49, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 -; SI-NEXT: buffer_load_dword 
v14, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v38, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v42 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v53 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v48 ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v7 ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v44 ; SI-NEXT: 
v_cvt_f32_f16_e32 v44, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v7 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v40 ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v13 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v14, v55 +; SI-NEXT: buffer_store_dword v14, 
off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v58, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v49 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f32_f16_e32 v43, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v33, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v51 ; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v43, v50 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v43, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: .LBB101_3: ; %end -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 
offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v53, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v14, v35 ; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_or_b32_e32 v14, v14, v53 ; SI-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v15, v38 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_add_i32_e32 v15, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v16 ; SI-NEXT: v_add_i32_e32 v16, vcc, 12, v0 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 @@ -218656,7 +218665,7 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v16, vcc, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 @@ -218665,57 +218674,61 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x 
bfloat> inreg % ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v37 ; SI-NEXT: v_add_i32_e32 v16, vcc, 20, v0 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v17 ; SI-NEXT: v_add_i32_e32 v16, vcc, 24, v0 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v14, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v19 ; SI-NEXT: v_add_i32_e32 v16, vcc, 28, v0 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v34 ; SI-NEXT: v_add_i32_e32 v16, vcc, 32, v0 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; 
SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v19 ; SI-NEXT: v_add_i32_e32 v16, vcc, 36, v0 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v26 ; SI-NEXT: v_add_i32_e32 v16, vcc, 40, v0 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v33 ; SI-NEXT: v_add_i32_e32 v16, vcc, 44, v0 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -218723,13 +218736,13 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v21 ; SI-NEXT: v_add_i32_e32 v16, vcc, 48, v0 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, 
v15, v14 ; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v14, v59 ; SI-NEXT: v_add_i32_e32 v16, vcc, 52, v0 @@ -218738,25 +218751,21 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v14, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v44 ; SI-NEXT: v_add_i32_e32 v16, vcc, 56, v0 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v14, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v46 ; SI-NEXT: v_add_i32_e32 v16, vcc, 60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v14, v62 ; SI-NEXT: v_add_i32_e32 v16, vcc, 64, v0 @@ -218767,14 +218776,16 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v14, v63 -; 
SI-NEXT: v_cvt_f16_f32_e32 v15, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v47 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x44, v0 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v46 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_or_b32_e32 v1, v14, v1 ; SI-NEXT: buffer_store_dword v1, v15, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -218784,21 +218795,23 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v14, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 @@ -218807,13 +218820,11 @@ define inreg <64 x half> 
@bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload @@ -218825,20 +218836,20 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload @@ -218850,33 +218861,35 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; SI-NEXT: 
v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -218901,20 +218914,17 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB101_4: -; SI-NEXT: buffer_store_dword v53, off, 
s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v55, v53 +; SI-NEXT: v_mov_b32_e32 v30, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v53, v54 -; SI-NEXT: v_mov_b32_e32 v40, v52 -; SI-NEXT: v_mov_b32_e32 v55, v50 -; SI-NEXT: v_mov_b32_e32 v42, v51 -; SI-NEXT: v_mov_b32_e32 v28, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 @@ -218938,70 +218948,75 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: v_mov_b32_e32 v27, v58 -; SI-NEXT: v_mov_b32_e32 v26, v57 +; SI-NEXT: v_mov_b32_e32 v40, v50 +; SI-NEXT: 
v_mov_b32_e32 v42, v41 +; SI-NEXT: v_mov_b32_e32 v36, v54 +; SI-NEXT: v_mov_b32_e32 v29, v58 +; SI-NEXT: v_mov_b32_e32 v28, v26 +; SI-NEXT: v_mov_b32_e32 v27, v57 ; SI-NEXT: v_mov_b32_e32 v25, v56 -; SI-NEXT: v_mov_b32_e32 v24, v47 ; SI-NEXT: v_mov_b32_e32 v23, v46 +; SI-NEXT: v_mov_b32_e32 v22, v21 +; SI-NEXT: v_mov_b32_e32 v20, v44 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: 
$vgpr62 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; kill: killed $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; kill: killed $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; kill: killed $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; kill: killed $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr43 @@ -221580,10 +221595,11 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v63, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v62, 0x38000000, v63 +; SI-NEXT: v_add_f32_e32 v62, 0x38000000, v31 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v63 ; SI-NEXT: v_add_f32_e32 v63, 0x38000000, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte 
Folded Spill ; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v61, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 @@ -221704,7 +221720,6 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -221945,9 +221960,9 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 ; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v62 ; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill @@ -222613,16 +222628,15 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:76 ; SI-NEXT: v_cvt_f16_f32_e32 v40, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v6 ; SI-NEXT: v_mov_b32_e32 v46, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v43, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v9 ; 
SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 @@ -222635,588 +222649,599 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v45, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v8, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s17 ; SI-NEXT: v_cvt_f16_f32_e32 v24, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v25, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v29, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v30, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v27, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s23 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: 
v_cvt_f16_f32_e32 v61, v48 -; SI-NEXT: s_waitcnt vmcnt(11) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v47, v39 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v50 ; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f16_f32_e32 v62, v62 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f16_f32_e32 v63, v63 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v44, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v23, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v38, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v37, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v48, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v49, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v35, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v50, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v51, s29 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v50, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s19 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v48, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v49, s27 +; 
SI-NEXT: v_cvt_f16_f32_e32 v32, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v39, s29 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:372 ; 4-byte Folded 
Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v14, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, 
off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB103_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v57 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v23 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v24 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v44 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v25 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v59 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v29 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v10 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill 
; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v11 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v6 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v12 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v13 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v35 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v38 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v15 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v48 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v31 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v49 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v17 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v32 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v18 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v24, v19 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v60 +; SI-NEXT: v_mov_b32_e32 v39, v30 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v33 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v36 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v40 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v20 ; 
SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v43 -; SI-NEXT: v_mov_b32_e32 v43, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v43 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v20 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v21 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_mov_b32_e32 v50, v19 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v56 +; SI-NEXT: v_mov_b32_e32 v23, v17 +; SI-NEXT: v_mov_b32_e32 v25, v20 +; SI-NEXT: v_mov_b32_e32 v29, v21 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v51, v22 -; SI-NEXT: v_mov_b32_e32 v38, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_mov_b32_e32 v37, v45 -; SI-NEXT: v_mov_b32_e32 v27, v26 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v26 -; SI-NEXT: v_mov_b32_e32 v49, v47 -; 
SI-NEXT: v_mov_b32_e32 v35, v28 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v22 +; SI-NEXT: v_mov_b32_e32 v48, v26 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v45 +; SI-NEXT: v_mov_b32_e32 v49, v16 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v16 +; SI-NEXT: v_mov_b32_e32 v35, v46 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v57 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v46 +; SI-NEXT: v_mov_b32_e32 v43, v59 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v58 +; SI-NEXT: v_mov_b32_e32 v32, v28 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v52 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v50 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v36 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 -; SI-NEXT: 
buffer_store_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v10 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v11 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v12 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v13 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v31 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v17 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v18 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v19 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v46 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 -; SI-NEXT: v_mov_b32_e32 v57, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v32 -; SI-NEXT: v_mov_b32_e32 v32, v7 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 -; SI-NEXT: 
v_lshlrev_b32_e32 v11, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 -; SI-NEXT: v_mov_b32_e32 v33, v12 -; SI-NEXT: v_mov_b32_e32 v34, v5 -; SI-NEXT: v_mov_b32_e32 v58, v7 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v44 -; SI-NEXT: v_mov_b32_e32 v44, v18 -; SI-NEXT: v_mov_b32_e32 v5, v43 -; SI-NEXT: v_mov_b32_e32 v18, v6 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v3 ; SI-NEXT: s_branch .LBB103_3 ; SI-NEXT: .LBB103_2: -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: v_mov_b32_e32 v39, v30 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: v_mov_b32_e32 v32, v28 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 
-; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: v_mov_b32_e32 v35, v46 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: v_mov_b32_e32 v49, v16 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: v_mov_b32_e32 v48, v26 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: v_mov_b32_e32 v51, v22 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: v_mov_b32_e32 v29, v21 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: v_mov_b32_e32 v25, v20 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: v_mov_b32_e32 v24, v19 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: v_mov_b32_e32 v23, v17 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: v_mov_b32_e32 v35, v28 -; SI-NEXT: v_mov_b32_e32 v49, v47 -; SI-NEXT: v_mov_b32_e32 v27, v26 -; SI-NEXT: v_mov_b32_e32 v37, v45 -; SI-NEXT: v_mov_b32_e32 v38, v16 -; SI-NEXT: v_mov_b32_e32 v51, v22 -; 
SI-NEXT: v_mov_b32_e32 v50, v19 +; SI-NEXT: ; kill: killed $vgpr5 ; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: v_mov_b32_e32 v5, v6 ; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: v_mov_b32_e32 v43, v59 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; kill: killed $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: 
$vgpr12 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v5, v57 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: .LBB103_3: ; %Flow -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v36, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v43, v9 -; SI-NEXT: v_mov_b32_e32 v12, v31 ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v31, v11 -; SI-NEXT: v_mov_b32_e32 v9, v17 +; SI-NEXT: v_mov_b32_e32 v47, v49 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v50, v2 ; SI-NEXT: s_cbranch_vccnz .LBB103_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 +; SI-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v62 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v26, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 +; SI-NEXT: v_mov_b32_e32 v1, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v24 +; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 +; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v45 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v25 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v29 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v10 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v55 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte 
Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v15, v54 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v10, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v42 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v61 -; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v41 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v15 -; SI-NEXT: v_mov_b32_e32 v6, v37 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v39 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v52 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v51 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v60 -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v50 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v13 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v5 +; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v42, v43 -; SI-NEXT: 
v_cvt_f32_f16_e32 v43, v18 -; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v44 +; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v8 ; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 ; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v43 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v10 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v14 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v37 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v56 +; SI-NEXT: 
v_cvt_f32_f16_e32 v4, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v23 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v47 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v38 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: buffer_load_dword v1, off, 
s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v34, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v36, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v36, 0x38000000, 
v36 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v51, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v40, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v41, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v44, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v44 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v46, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v46 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v47, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v47, 0x38000000, v47 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 ; SI-NEXT: 
s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v57, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v10, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v26 ; SI-NEXT: v_add_f32_e32 v57, 0x38000000, v57 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v35 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v58, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v58 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v26, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v22, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v35, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 
4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v7, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v59, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v59, 0x38000000, v59 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v60, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v60, 0x38000000, v60 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v61, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v61, 0x38000000, v61 ; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v62, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v62, 0x38000000, v62 ; SI-NEXT: v_cvt_f16_f32_e32 v62, v62 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v63, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v63, 0x38000000, v63 ; SI-NEXT: v_cvt_f16_f32_e32 v63, v63 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -223262,14 +223287,22 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v57 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v26 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v57 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v30 +; SI-NEXT: v_mov_b32_e32 v30, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 ; SI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill @@ -223293,7 +223326,7 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v43 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v42 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v41 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -223328,99 +223361,92 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-NEXT: v_cvt_f16_f32_e32 v5, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v31 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; 
SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v5, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v6 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v16 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v14 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v23 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v15 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v48 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 
offset:440 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v24 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v14 -; SI-NEXT: v_mov_b32_e32 v16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v15 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v37 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v55 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v30 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v49 +; SI-NEXT: v_mov_b32_e32 v8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v53 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v54 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: 
v_lshlrev_b32_e32 v28, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v55 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v48 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v4 +; SI-NEXT: v_mov_b32_e32 v4, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v4 -; SI-NEXT: v_mov_b32_e32 v4, v27 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v3 -; SI-NEXT: v_mov_b32_e32 v3, v13 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v2 ; SI-NEXT: .LBB103_5: ; %end ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -223510,7 +223536,7 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, 
off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -223571,17 +223597,19 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v31 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -223591,8 +223619,8 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -223601,7 +223629,7 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -223611,102 +223639,100 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v56 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v6 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v57 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v45 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v61 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: 
v_mul_f32_e32 v2, 1.0, v57 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v59 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v60 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v58 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v44 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v30 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v34 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v38 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v50 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v58 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v27 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v10 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v19 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v46 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v40 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v31 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: 
buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v45 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v40 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v21 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v28 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v17 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v47 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v16 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v14 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v15 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: 
v_mul_f32_e32 v2, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v22 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen @@ -224058,17 +224084,17 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; SI-NEXT: buffer_load_dword v42, 
off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 @@ -224087,169 +224113,170 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:80 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v1 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v15 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v23 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v27 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v21 -; SI-NEXT: 
v_mul_f32_e32 v12, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v34 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v35 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt 
expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v37 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v52 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:120 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v44 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:120 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v38 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v50 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v44 ; SI-NEXT: v_mul_f32_e32 v44, 1.0, v46 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v38 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v49 -; SI-NEXT: v_mul_f32_e32 v7, 1.0, v50 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v52 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v40 ; SI-NEXT: v_mul_f32_e32 v46, 1.0, v58 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v59 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v59 ; SI-NEXT: v_mul_f32_e32 v6, 1.0, v41 ; SI-NEXT: v_mul_f32_e32 v5, 1.0, v42 -; SI-NEXT: v_mul_f32_e32 
v27, 1.0, v45 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v47 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v56 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; 
implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 ; SI-NEXT: ; 
implicit-def: $vgpr45 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; kill: killed $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v47 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v56 ; SI-NEXT: v_mul_f32_e32 v47, 1.0, v57 ; SI-NEXT: v_mul_f32_e32 v60, 1.0, v60 ; SI-NEXT: v_mul_f32_e32 v57, 1.0, v62 ; SI-NEXT: v_mul_f32_e32 v56, 1.0, v63 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v43 ; SI-NEXT: ; kill: killed $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr41 @@ -224264,106 +224291,96 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_mul_f32_e32 v59, 1.0, v1 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_mul_f32_e32 v58, 1.0, v3 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v22 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v23 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v24 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v22 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v24, off, 
s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:132 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v30 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v22 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v34 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; 
kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB104_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v34 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v43 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v33 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v30 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill 
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v23 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 +; SI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 @@ -224372,14 +224389,10 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v47 ; SI-NEXT: ; kill: killed $vgpr1 @@ -224391,7 +224404,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v61 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill @@ -224399,7 +224412,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v60 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v57 ; 
SI-NEXT: ; kill: killed $vgpr1 @@ -224411,265 +224424,277 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v29 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v16 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v59 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v58 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v27 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], 
s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v15 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v25 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v26 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v40 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v44 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:308 ; 4-byte 
Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v3 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v22 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: buffer_load_dword v9, off, 
s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v40 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, 
v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v55 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v32 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v54 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v19 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v13 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v21 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:220 ; 4-byte Folded 
Spill +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v18 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v9 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v31 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v32 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:208 ; 4-byte 
Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v55 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v31 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v17 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v54 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v19 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v53 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: s_waitcnt 
vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v53 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: .LBB104_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB104_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v33 -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v43 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v34 ; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v33 -; SI-NEXT: v_alignbit_b32 v22, v34, v22, 16 -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded 
Spill +; SI-NEXT: v_alignbit_b32 v28, v34, v28, 16 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v54 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v40 -; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v32 -; SI-NEXT: v_alignbit_b32 v22, v34, v22, 16 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v34 +; SI-NEXT: v_alignbit_b32 v28, v35, v28, 16 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v31 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v32 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v31 -; SI-NEXT: v_alignbit_b32 v22, v34, v22, 16 -; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v53 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v54 -; SI-NEXT: v_add_f32_e32 v51, 0x40c00000, v34 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v51 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v31 +; SI-NEXT: v_alignbit_b32 v28, v32, v28, 16 ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_alignbit_b32 v22, v34, v22, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill 
-; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 -; SI-NEXT: v_alignbit_b32 v15, v22, v15, 16 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 ; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v19 +; SI-NEXT: v_alignbit_b32 v28, v32, v28, 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 -; SI-NEXT: v_alignbit_b32 v15, v19, v15, 16 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 +; SI-NEXT: v_alignbit_b32 v13, v28, v13, 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 -; SI-NEXT: v_alignbit_b32 v13, v18, v13, 16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 +; SI-NEXT: v_alignbit_b32 v13, v21, v13, 16 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) 
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v20 ; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v13 +; SI-NEXT: v_alignbit_b32 v11, v20, v11, 16 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v17 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_alignbit_b32 v11, v19, v11, 16 ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; SI-NEXT: v_alignbit_b32 v13, v17, v13, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v9 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v9 ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; SI-NEXT: v_alignbit_b32 v10, v13, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, v17, v11, 16 ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: 
s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: v_alignbit_b32 v8, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v8, v11, v8, 16 ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill @@ -224680,254 +224705,256 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v6, v8, v6, 16 ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; SI-NEXT: v_alignbit_b32 v4, v6, v4, 16 ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v60 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v29 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v61 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v6 ; SI-NEXT: v_alignbit_b32 v4, v8, v4, 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v27 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v16 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v23 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v29 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: 
v_alignbit_b32 v4, v10, v4, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v24 +; SI-NEXT: v_alignbit_b32 v4, v11, v4, 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v26 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v61, v1, v3, 16 +; SI-NEXT: v_alignbit_b32 v19, v1, v3, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v58 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v59 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v19, v10, v3, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v27 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v20, v11, v3, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v56 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v56 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v57 -; SI-NEXT: 
v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_alignbit_b32 v4, v13, v4, 16 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_alignbit_b32 v4, v16, v4, 16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v20 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v26 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v20, v10, v8, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v21, v11, v8, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v46 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v47 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v46 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_alignbit_b32 v4, v17, v4, 16 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_alignbit_b32 v4, v16, v4, 16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v21 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v21, v10, v8, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; 
SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v44 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v28 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v22 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v22, v11, v8, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v23, v10, v8, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v23, v11, v8, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v30 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v40 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v24 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v24, v11, v8, 16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_alignbit_b32 v18, v16, v4, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v13 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_alignbit_b32 v45, v17, v4, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_alignbit_b32 v7, v24, v7, 16 ; SI-NEXT: v_alignbit_b32 v5, v23, v5, 16 -; SI-NEXT: v_alignbit_b32 v2, v21, v2, 16 -; SI-NEXT: v_alignbit_b32 v1, v61, v1, 16 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v2, v22, v2, 16 +; SI-NEXT: v_alignbit_b32 v1, v19, v1, 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v24, v10, v8, 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: 
buffer_store_dword v11, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v25, v11, v8, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v7, v24, v7, 16 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v8, v25, v8, 16 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v17 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_alignbit_b32 
v25, v45, v8, 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v8, v25, v8, 16 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_alignbit_b32 v62, v63, v16, 16 ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v33 ; SI-NEXT: v_alignbit_b32 v16, v62, v16, 16 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_alignbit_b32 v22, v34, v9, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v9, v22, v9, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: 
v_lshrrev_b32_e32 v36, 16, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_alignbit_b32 v37, v38, v11, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v11, v37, v11, 16 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v35, v36, v11, 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_alignbit_b32 v39, v48, v12, 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_alignbit_b32 v35, v36, v10, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v12, v39, v12, 16 -; SI-NEXT: v_alignbit_b32 v10, v35, v10, 16 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: 
v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_alignbit_b32 v37, v38, v13, 16 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v10, v37, v10, 16 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v35, v11, 16 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_alignbit_b32 v39, v48, v13, 16 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v12, v39, v12, 16 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_alignbit_b32 v49, v50, v13, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v51 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:412 ; 4-byte Folded 
Reload +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_alignbit_b32 v28, v43, v9, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v18 +; SI-NEXT: v_alignbit_b32 v41, v42, v15, 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v34 +; SI-NEXT: v_alignbit_b32 v15, v41, v15, 16 ; SI-NEXT: v_alignbit_b32 v13, v49, v13, 16 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v9, v28, v9, 16 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_alignbit_b32 v51, v52, v14, 16 ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v31 ; SI-NEXT: v_alignbit_b32 v14, v51, v14, 16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_alignbit_b32 v41, v42, v15, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v32 -; SI-NEXT: v_alignbit_b32 v15, v41, v15, 16 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; 
SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v2, v21, v6, 16 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v2, v20, v6, 16 +; SI-NEXT: v_alignbit_b32 v2, v20, v3, 16 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v2, v19, v3, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v45, v4, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, 
s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v1, v18, v4, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: .LBB104_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -224942,7 +224969,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -224958,7 +224985,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -224974,7 +225001,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt 
expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -224990,7 +225017,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -225006,7 +225033,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -225022,7 +225049,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -225038,7 
+225065,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -225047,14 +225074,14 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -225064,15 +225091,17 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 
16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -225091,8 +225120,8 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -225111,8 +225140,8 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -225132,7 +225161,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v2, 
s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -225151,8 +225180,8 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -225171,8 +225200,8 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -225180,9 +225209,11 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword 
v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 @@ -225190,7 +225221,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -225198,12 +225229,10 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v45 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 @@ -227564,282 +227593,892 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; SI-NEXT: v_mul_f32_e32 v44, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; SI-NEXT: 
v_mul_f32_e32 v2, 1.0, v3 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v4 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v6 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v11 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v12 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v4 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v45, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v5, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v62, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v19 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 
v21, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v25 ; SI-NEXT: v_mul_f32_e32 v22, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v63, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v28 ; SI-NEXT: v_mul_f32_e32 v26, 1.0, v29 ; SI-NEXT: v_mul_f32_e32 v23, 1.0, v30 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s16 ; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 ; SI-NEXT: v_mul_f32_e64 v9, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v29, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v25, 1.0, s28 -; SI-NEXT: v_mul_f32_e64 v24, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v24, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s29 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v32 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v33 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v33 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v34 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v36 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: 
v_mul_f32_e32 v28, 1.0, v34 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: v_mul_f32_e32 v27, 1.0, v35 -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v37 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v38 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v39 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v48 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v49 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v38 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v48 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v49 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v50 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v51 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v50 -; SI-NEXT: s_waitcnt vmcnt(10) expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v54 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v51 -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v52 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v53 -; SI-NEXT: s_waitcnt vmcnt(6) expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v43 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v52 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v53 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v54 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_mul_f32_e32 v49, 1.0, v55 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v40 -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v41 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v42 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e64 v53, 1.0, s17 +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v42 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v43 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v40 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v41 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: 
v_mul_f32_e64 v39, 1.0, s17 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v51, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v15, 1.0, s26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: 
buffer_store_dword v14, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v31, 1.0, s21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, 
s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:396 
; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB105_2 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB105_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v51 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
v_lshrrev_b32_e32 v2, 16, v50 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v31 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v29 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v44 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v53 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v25 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v14 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v24 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; 
SI-NEXT: v_mov_b32_e32 v7, v5 -; SI-NEXT: v_mov_b32_e32 v42, v62 -; SI-NEXT: v_mov_b32_e32 v43, v63 -; SI-NEXT: v_mov_b32_e32 v55, v12 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v25 -; SI-NEXT: v_mov_b32_e32 v25, v60 -; SI-NEXT: v_mov_b32_e32 v54, v47 -; SI-NEXT: v_mov_b32_e32 v40, v20 -; SI-NEXT: v_mov_b32_e32 v51, v61 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v46 -; SI-NEXT: v_mov_b32_e32 v29, v31 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v11 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v39 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v11, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v47 +; SI-NEXT: v_mov_b32_e32 v39, v13 +; SI-NEXT: v_mov_b32_e32 v31, v6 +; SI-NEXT: v_mov_b32_e32 v29, v61 ; SI-NEXT: v_mov_b32_e32 v24, v56 +; SI-NEXT: v_mov_b32_e32 v38, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v57 +; SI-NEXT: v_mov_b32_e32 v25, v60 ; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_mov_b32_e32 v52, v10 -; SI-NEXT: v_mov_b32_e32 v53, v59 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_mov_b32_e32 v55, v12 +; SI-NEXT: v_mov_b32_e32 v40, v58 +; SI-NEXT: v_mov_b32_e32 v52, v59 +; SI-NEXT: v_mov_b32_e32 v43, v37 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; SI-NEXT: 
v_lshrrev_b32_e32 v48, 16, v48 ; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v49 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_mov_b32_e32 v62, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v61 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v45 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v57 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v4 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v50, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v46 +; SI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v62 -; SI-NEXT: v_mov_b32_e32 v62, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v63 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v42, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v41, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v10 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v59 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v32 -; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: 
v_lshrrev_b32_e32 v1, 16, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v39 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_mov_b32_e32 v41, v1 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v60 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v47 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v63 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v56 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v54 +; SI-NEXT: v_mov_b32_e32 v54, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v53 +; SI-NEXT: v_mov_b32_e32 v53, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; 
SI-NEXT: v_lshrrev_b32_e32 v14, 16, v58 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v34 +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v33 ; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v13 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; SI-NEXT: v_mov_b32_e32 v13, v4 +; SI-NEXT: s_cbranch_execnz .LBB105_3 +; SI-NEXT: .LBB105_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v52 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v34 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v56 -; SI-NEXT: v_mov_b32_e32 v39, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v37 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v62 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_alignbit_b32 v1, v9, v1, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_alignbit_b32 v1, v11, v1, 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v11 +; SI-NEXT: v_alignbit_b32 v1, v14, v1, 16 +; SI-NEXT: v_and_b32_e32 v14, 
0xffff0000, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v14 +; SI-NEXT: v_alignbit_b32 v1, v16, v1, 16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v21 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v58 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v35 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v48 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v37, v38 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v16 +; SI-NEXT: v_alignbit_b32 v1, v18, v1, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_alignbit_b32 v53, v20, v1, 16 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v39 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v20 +; SI-NEXT: v_alignbit_b32 v54, v22, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v23 +; 
SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_alignbit_b32 v1, v23, v1, 16 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v25 +; SI-NEXT: v_alignbit_b32 v1, v26, v1, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v27 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v29 +; SI-NEXT: v_alignbit_b32 v17, v27, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v33 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v25 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v24 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v28 +; SI-NEXT: 
v_lshrrev_b32_e32 v28, 16, v31 +; SI-NEXT: v_alignbit_b32 v28, v28, v1, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v24 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v34 -; SI-NEXT: s_branch .LBB105_3 -; SI-NEXT: .LBB105_2: -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v55, v12 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v30 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v32 +; SI-NEXT: v_alignbit_b32 v30, v30, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_alignbit_b32 v36, v35, v1, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v8 +; 
SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v32 +; SI-NEXT: v_alignbit_b32 v48, v49, v1, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v31 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v29 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_alignbit_b32 v26, v58, v1, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_alignbit_b32 v23, v27, v1, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v21, v23, v21, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_alignbit_b32 v56, v59, v1, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_alignbit_b32 v12, v22, v1, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; 
SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v63, v1, v20, 16 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_alignbit_b32 v47, v63, v19, 16 +; SI-NEXT: v_alignbit_b32 v19, v56, v13, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v24 +; SI-NEXT: v_alignbit_b32 v60, v18, v20, 16 +; SI-NEXT: v_alignbit_b32 v16, v60, v16, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v29 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v61, v7, v25, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v33 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v57, v7, v31, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 
offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v31 +; SI-NEXT: v_alignbit_b32 v24, v46, v11, 16 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v11 +; SI-NEXT: v_alignbit_b32 v25, v45, v9, 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v44, v15, v3, 16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: 
v_add_f32_e32 v39, 0x40c00000, v39 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; SI-NEXT: v_alignbit_b32 v9, v3, v9, 16 +; SI-NEXT: v_alignbit_b32 v2, v20, v2, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v50, 0x40c00000, v50 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v50 +; SI-NEXT: v_alignbit_b32 v50, v51, v39, 16 +; SI-NEXT: v_alignbit_b32 v4, v50, v4, 16 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v4, v9, v5, 16 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v4, v2, v6, 16 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v4, v44, v38, 16 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v4, v25, v37, 16 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v4, v24, v34, 16 +; SI-NEXT: v_mov_b32_e32 v7, v25 +; SI-NEXT: v_mov_b32_e32 v37, v17 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v4, v57, v33, 16 +; SI-NEXT: v_alignbit_b32 v6, v61, v14, 16 +; SI-NEXT: v_alignbit_b32 v17, v12, v32, 16 +; SI-NEXT: v_alignbit_b32 v14, v26, v10, 16 +; SI-NEXT: v_alignbit_b32 v32, v48, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v36, v52, 16 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: .LBB105_3: ; %end +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v4 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v51 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v8, v10, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v8, v4 +; SI-NEXT: v_add_i32_e32 v8, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v4, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v3 +; SI-NEXT: v_or_b32_e32 v4, v4, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v4, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v20 +; SI-NEXT: v_or_b32_e32 v4, v4, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v4, v8, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 
4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v46 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v57 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v61 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v16 +; 
SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v47 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v63 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: 
buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v36 +; SI-NEXT: 
v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB105_4: +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v39, v13 ; SI-NEXT: v_mov_b32_e32 v33, v34 -; 
SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 @@ -227871,695 +228510,73 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v7, v5 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: v_mov_b32_e32 v51, v61 -; SI-NEXT: v_mov_b32_e32 v42, v62 -; SI-NEXT: v_mov_b32_e32 v29, v31 +; SI-NEXT: v_mov_b32_e32 v11, v5 +; SI-NEXT: v_mov_b32_e32 v31, v6 +; SI-NEXT: v_mov_b32_e32 v29, v61 ; SI-NEXT: v_mov_b32_e32 v25, v60 ; SI-NEXT: v_mov_b32_e32 v24, v56 -; SI-NEXT: v_mov_b32_e32 v54, v47 -; SI-NEXT: v_mov_b32_e32 v40, v20 -; SI-NEXT: v_mov_b32_e32 v43, v63 -; SI-NEXT: v_mov_b32_e32 v52, v10 -; SI-NEXT: v_mov_b32_e32 v53, v59 -; SI-NEXT: v_mov_b32_e32 v39, v4 -; SI-NEXT: v_mov_b32_e32 v37, v38 -; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v47 +; SI-NEXT: v_mov_b32_e32 v55, v12 +; SI-NEXT: v_mov_b32_e32 v40, v58 +; SI-NEXT: v_mov_b32_e32 v52, v59 +; SI-NEXT: v_mov_b32_e32 v38, v4 +; SI-NEXT: v_mov_b32_e32 v43, v37 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: 
; kill: killed $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: 
$vgpr4 ; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: .LBB105_3: ; %Flow -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: s_cbranch_vccnz .LBB105_5 -; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v40 -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v55 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v39 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v37 -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v33 -; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v32 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v34 -; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v33 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v1, 
0xffff0000, v1 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 
v1, 0xffff0000, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_alignbit_b32 v1, v3, v1, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; SI-NEXT: v_alignbit_b32 v1, v9, v1, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v50 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; SI-NEXT: v_alignbit_b32 v1, v11, v1, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v41 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v42 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v11 -; SI-NEXT: v_alignbit_b32 v1, v14, v1, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v25 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v14 -; SI-NEXT: v_alignbit_b32 v51, v16, v1, 16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v54 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v24 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; 
SI-NEXT: v_lshrrev_b32_e32 v18, 16, v16 -; SI-NEXT: v_alignbit_b32 v1, v18, v1, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v53 -; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v25 -; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v29 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v31 -; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v2 -; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v9 -; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v3 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v18, v20, v1, 16 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v7 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v43 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v20 -; SI-NEXT: v_alignbit_b32 v1, v22, v1, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v52 -; SI-NEXT: 
v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v1, v23, v1, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v1, v26, v1, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v32 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v29 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v33 -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v1, v27, v1, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v12 -; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v25 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_and_b32_e32 
v27, 0xffff0000, v24 -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v24 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v1, v28, v1, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v52, v30, v1, 16 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v36, v35, v1, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v8 -; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v32 -; SI-NEXT: v_alignbit_b32 v48, v49, v1, 16 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v10 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v25 -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v20 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 
v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v28, v59, v1, 16 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v26, v28, v26, 16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_alignbit_b32 v46, v61, v31, 16 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v21, v30, v1, 16 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v31 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v23, v10, v1, 16 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v63, v23, v27, 16 -; SI-NEXT: v_alignbit_b32 v27, v21, v12, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v57, v58, v1, 16 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v17, v1, v20, 16 -; SI-NEXT: v_and_b32_e32 v20, 
0xffff0000, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_alignbit_b32 v19, v17, v19, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v24 -; SI-NEXT: v_alignbit_b32 v56, v47, v20, 16 -; SI-NEXT: v_alignbit_b32 v20, v62, v11, 16 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v16, v56, v16, 16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v22, v45, v9, 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v29 -; SI-NEXT: v_alignbit_b32 v13, v60, v25, 16 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v9 -; SI-NEXT: v_alignbit_b32 v24, v44, v3, 16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v9, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v9, v11, v9, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v15 -; SI-NEXT: v_mov_b32_e32 v15, v24 -; SI-NEXT: v_add_f32_e32 v50, 0x40c00000, v50 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v3, v3, v39, 16 -; SI-NEXT: v_alignbit_b32 v4, v3, v4, 16 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v4, v9, v5, 16 -; SI-NEXT: v_alignbit_b32 v5, v36, v7, 16 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v4, v2, v6, 16 -; SI-NEXT: v_alignbit_b32 v6, v46, v33, 16 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v4, v24, v38, 16 -; SI-NEXT: v_alignbit_b32 v38, v48, v8, 16 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; 
SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v4, v22, v37, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v57, v32, 16 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v4, v20, v34, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v20, v52 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v4, v13, v14, 16 -; SI-NEXT: v_mov_b32_e32 v14, v51 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: .LBB105_5: ; %end -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v3 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; SI-NEXT: 
v_or_b32_e32 v4, v7, v4 -; SI-NEXT: v_add_i32_e32 v7, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 -; SI-NEXT: v_or_b32_e32 v4, v4, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v3 -; SI-NEXT: v_or_b32_e32 v4, v4, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v62 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v46 -; 
SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v60 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v56 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v47 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v17 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: 
v_lshlrev_b32_e32 v2, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v57 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v63 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; 
SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v48 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v36 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: s_branch .LBB105_2 ; ; VI-LABEL: bitcast_v64bf16_to_v64i16_scalar: ; VI: ; %bb.0: @@ -231641,50 +231658,50 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: s_mov_b32 s60, s16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v41, s17, 0 -; SI-NEXT: s_mov_b32 s61, s19 ; SI-NEXT: v_writelane_b32 v41, s60, 1 -; SI-NEXT: s_mov_b32 s63, s18 -; SI-NEXT: v_writelane_b32 v41, s61, 2 +; SI-NEXT: s_mov_b32 s61, s18 +; SI-NEXT: v_writelane_b32 v41, s19, 2 ; SI-NEXT: s_mov_b32 s72, s21 -; SI-NEXT: v_writelane_b32 v41, s63, 3 +; SI-NEXT: v_writelane_b32 v41, s61, 3 ; SI-NEXT: v_writelane_b32 v41, s72, 4 ; SI-NEXT: s_mov_b32 s74, s23 ; SI-NEXT: v_writelane_b32 v41, s20, 5 ; SI-NEXT: v_writelane_b32 v41, s74, 6 -; SI-NEXT: s_mov_b32 s75, s25 +; SI-NEXT: s_mov_b32 s76, s25 ; SI-NEXT: v_writelane_b32 v41, s22, 7 -; SI-NEXT: v_writelane_b32 v41, s75, 8 -; SI-NEXT: s_mov_b32 s76, s27 +; SI-NEXT: v_writelane_b32 v41, s76, 8 +; SI-NEXT: s_mov_b32 s78, s27 ; SI-NEXT: v_writelane_b32 v41, s24, 9 -; SI-NEXT: 
v_writelane_b32 v41, s76, 10 -; SI-NEXT: s_mov_b32 s93, s29 +; SI-NEXT: v_writelane_b32 v41, s78, 10 +; SI-NEXT: s_mov_b32 s79, s29 ; SI-NEXT: v_writelane_b32 v41, s26, 11 -; SI-NEXT: v_writelane_b32 v41, s93, 12 -; SI-NEXT: v_readfirstlane_b32 s16, v2 +; SI-NEXT: v_writelane_b32 v41, s79, 12 +; SI-NEXT: v_readfirstlane_b32 s6, v2 ; SI-NEXT: v_writelane_b32 v41, s28, 13 ; SI-NEXT: v_readfirstlane_b32 s73, v4 -; SI-NEXT: v_writelane_b32 v41, s16, 14 -; SI-NEXT: v_readfirstlane_b32 s89, v3 +; SI-NEXT: v_writelane_b32 v41, s6, 14 +; SI-NEXT: v_readfirstlane_b32 s95, v3 ; SI-NEXT: v_writelane_b32 v41, s73, 15 -; SI-NEXT: v_readfirstlane_b32 s90, v6 -; SI-NEXT: v_writelane_b32 v41, s89, 16 -; SI-NEXT: v_readfirstlane_b32 s91, v5 -; SI-NEXT: v_writelane_b32 v41, s90, 17 -; SI-NEXT: v_readfirstlane_b32 s34, v8 -; SI-NEXT: v_writelane_b32 v41, s91, 18 -; SI-NEXT: v_readfirstlane_b32 s35, v7 -; SI-NEXT: v_writelane_b32 v41, s34, 19 -; SI-NEXT: v_readfirstlane_b32 s36, v10 -; SI-NEXT: v_writelane_b32 v41, s35, 20 +; SI-NEXT: v_readfirstlane_b32 s36, v6 +; SI-NEXT: v_writelane_b32 v41, s95, 16 +; SI-NEXT: v_readfirstlane_b32 s30, v5 +; SI-NEXT: v_writelane_b32 v41, s36, 17 +; SI-NEXT: v_readfirstlane_b32 s37, v8 +; SI-NEXT: v_writelane_b32 v41, s30, 18 +; SI-NEXT: v_readfirstlane_b32 s38, v7 +; SI-NEXT: v_writelane_b32 v41, s37, 19 +; SI-NEXT: v_readfirstlane_b32 s39, v10 +; SI-NEXT: v_writelane_b32 v41, s38, 20 ; SI-NEXT: v_writelane_b32 v40, s96, 32 -; SI-NEXT: v_readfirstlane_b32 s37, v9 -; SI-NEXT: v_writelane_b32 v41, s36, 21 +; SI-NEXT: v_readfirstlane_b32 s48, v9 +; SI-NEXT: v_writelane_b32 v41, s39, 21 +; SI-NEXT: v_writelane_b32 v40, s97, 33 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s62, v31 +; SI-NEXT: v_readfirstlane_b32 s92, v31 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_readfirstlane_b32 s80, v32 +; SI-NEXT: v_readfirstlane_b32 s93, v32 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_readfirstlane_b32 s69, v33 +; SI-NEXT: v_readfirstlane_b32 s90, 
v33 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 @@ -231696,20 +231713,19 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s84, v34 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s68, v35 +; SI-NEXT: v_readfirstlane_b32 s94, v35 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s83, v36 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_readfirstlane_b32 s87, v38 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 -; SI-NEXT: v_readfirstlane_b32 s6, v37 +; SI-NEXT: v_readfirstlane_b32 s91, v37 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 -; SI-NEXT: v_writelane_b32 v40, s97, 33 -; SI-NEXT: v_readfirstlane_b32 s38, v12 -; SI-NEXT: v_writelane_b32 v41, s37, 22 +; SI-NEXT: v_readfirstlane_b32 s49, v12 +; SI-NEXT: v_writelane_b32 v41, s48, 22 ; SI-NEXT: v_writelane_b32 v40, s98, 34 ; SI-NEXT: v_readfirstlane_b32 s14, v30 ; SI-NEXT: v_readfirstlane_b32 s15, v29 @@ -231719,21 +231735,21 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: v_readfirstlane_b32 s11, v25 ; SI-NEXT: v_readfirstlane_b32 s8, v24 ; SI-NEXT: v_readfirstlane_b32 s9, v23 -; SI-NEXT: v_readfirstlane_b32 s88, v22 -; SI-NEXT: v_readfirstlane_b32 s29, v21 -; SI-NEXT: v_readfirstlane_b32 s79, v20 -; SI-NEXT: v_readfirstlane_b32 s27, v19 -; SI-NEXT: v_readfirstlane_b32 s78, v18 -; SI-NEXT: v_readfirstlane_b32 s25, v17 -; SI-NEXT: v_readfirstlane_b32 s77, v16 -; SI-NEXT: v_readfirstlane_b32 s23, v15 -; SI-NEXT: v_readfirstlane_b32 s39, v14 -; SI-NEXT: v_readfirstlane_b32 s21, v13 -; SI-NEXT: v_readfirstlane_b32 s19, v11 
+; SI-NEXT: v_readfirstlane_b32 s89, v22 +; SI-NEXT: v_readfirstlane_b32 s7, v21 +; SI-NEXT: v_readfirstlane_b32 s88, v20 +; SI-NEXT: v_readfirstlane_b32 s29, v19 +; SI-NEXT: v_readfirstlane_b32 s77, v18 +; SI-NEXT: v_readfirstlane_b32 s27, v17 +; SI-NEXT: v_readfirstlane_b32 s75, v16 +; SI-NEXT: v_readfirstlane_b32 s25, v15 +; SI-NEXT: v_readfirstlane_b32 s50, v14 +; SI-NEXT: v_readfirstlane_b32 s23, v13 +; SI-NEXT: v_readfirstlane_b32 s21, v11 ; SI-NEXT: v_readfirstlane_b32 s18, v1 -; SI-NEXT: v_writelane_b32 v41, s38, 23 +; SI-NEXT: v_writelane_b32 v41, s49, 23 ; SI-NEXT: v_writelane_b32 v40, s99, 35 -; SI-NEXT: v_writelane_b32 v41, s39, 24 +; SI-NEXT: v_writelane_b32 v41, s50, 24 ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s58, v31 ; SI-NEXT: s_waitcnt vmcnt(11) @@ -231762,165 +231778,166 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB107_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_writelane_b32 v41, s5, 25 +; SI-NEXT: s_lshl_b32 s5, s61, 16 +; SI-NEXT: v_writelane_b32 v41, s5, 26 +; SI-NEXT: s_lshl_b32 s5, s20, 16 +; SI-NEXT: v_writelane_b32 v41, s5, 27 +; SI-NEXT: s_lshl_b32 s5, s22, 16 +; SI-NEXT: v_writelane_b32 v41, s5, 28 +; SI-NEXT: s_lshl_b32 s5, s24, 16 +; SI-NEXT: v_writelane_b32 v41, s5, 29 +; SI-NEXT: s_lshl_b32 s5, s26, 16 +; SI-NEXT: v_writelane_b32 v41, s5, 30 +; SI-NEXT: s_lshl_b32 s5, s28, 16 +; SI-NEXT: v_writelane_b32 v41, s5, 31 +; SI-NEXT: s_lshl_b32 s5, s18, 16 +; SI-NEXT: v_writelane_b32 v41, s5, 32 +; SI-NEXT: s_lshl_b32 s5, s95, 16 +; SI-NEXT: v_writelane_b32 v41, s5, 33 +; SI-NEXT: s_lshl_b32 s5, s38, 16 ; SI-NEXT: s_lshl_b32 s4, s60, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 25 -; SI-NEXT: s_lshl_b32 s4, s63, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 26 -; SI-NEXT: s_lshl_b32 s4, s20, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 27 -; SI-NEXT: s_lshl_b32 s4, s22, 16 -; SI-NEXT: 
v_writelane_b32 v41, s4, 28 -; SI-NEXT: s_lshl_b32 s4, s24, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 29 -; SI-NEXT: s_lshl_b32 s4, s26, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 30 -; SI-NEXT: s_lshl_b32 s4, s28, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 31 -; SI-NEXT: s_lshl_b32 s4, s18, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 32 -; SI-NEXT: s_lshl_b32 s4, s89, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 33 -; SI-NEXT: s_lshl_b32 s4, s91, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 34 -; SI-NEXT: s_lshl_b32 s4, s35, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 35 -; SI-NEXT: s_lshl_b32 s4, s37, 16 -; SI-NEXT: s_lshl_b32 s7, s17, 16 -; SI-NEXT: s_lshl_b32 s96, s61, 16 -; SI-NEXT: s_lshl_b32 s99, s72, 16 -; SI-NEXT: s_lshl_b32 s97, s74, 16 -; SI-NEXT: s_lshl_b32 s92, s75, 16 -; SI-NEXT: s_lshl_b32 s94, s76, 16 -; SI-NEXT: s_lshl_b32 s95, s93, 16 -; SI-NEXT: s_lshl_b32 s93, s16, 16 -; SI-NEXT: s_lshl_b32 s30, s73, 16 -; SI-NEXT: s_lshl_b32 s31, s90, 16 -; SI-NEXT: s_lshl_b32 s34, s34, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 36 -; SI-NEXT: s_lshl_b32 s35, s36, 16 -; SI-NEXT: s_lshl_b32 s86, s19, 16 -; SI-NEXT: s_lshl_b32 s36, s38, 16 -; SI-NEXT: s_lshl_b32 s22, s21, 16 +; SI-NEXT: v_writelane_b32 v41, s5, 34 +; SI-NEXT: s_lshl_b32 s5, s48, 16 +; SI-NEXT: s_lshl_b32 s16, s19, 16 +; SI-NEXT: s_lshl_b32 s63, s72, 16 +; SI-NEXT: s_lshl_b32 s98, s74, 16 +; SI-NEXT: s_lshl_b32 s62, s76, 16 +; SI-NEXT: s_lshl_b32 s96, s78, 16 +; SI-NEXT: s_lshl_b32 s31, s79, 16 +; SI-NEXT: s_lshl_b32 s34, s6, 16 +; SI-NEXT: s_lshl_b32 s35, s73, 16 +; SI-NEXT: s_lshl_b32 s30, s30, 16 +; SI-NEXT: s_lshl_b32 s36, s36, 16 +; SI-NEXT: s_lshl_b32 s99, s37, 16 +; SI-NEXT: v_writelane_b32 v41, s5, 35 ; SI-NEXT: s_lshl_b32 s37, s39, 16 -; SI-NEXT: s_lshl_b32 s24, s23, 16 -; SI-NEXT: s_lshl_b32 s38, s77, 16 -; SI-NEXT: s_lshl_b32 s28, s25, 16 -; SI-NEXT: s_lshl_b32 s39, s78, 16 -; SI-NEXT: s_lshl_b32 s61, s27, 16 -; SI-NEXT: s_lshl_b32 s48, s79, 16 -; SI-NEXT: s_lshl_b32 s89, s29, 16 -; SI-NEXT: s_lshl_b32 s49, s88, 16 
-; SI-NEXT: s_lshl_b32 s60, s9, 16 -; SI-NEXT: s_lshl_b32 s50, s8, 16 -; SI-NEXT: s_lshl_b32 s90, s11, 16 -; SI-NEXT: s_lshl_b32 s91, s10, 16 -; SI-NEXT: s_lshl_b32 s70, s13, 16 -; SI-NEXT: s_lshl_b32 s51, s12, 16 -; SI-NEXT: s_lshl_b32 s71, s15, 16 -; SI-NEXT: s_lshl_b32 s52, s14, 16 -; SI-NEXT: s_lshl_b32 s20, s41, 16 -; SI-NEXT: s_lshl_b32 s53, s40, 16 -; SI-NEXT: s_lshl_b32 s81, s43, 16 -; SI-NEXT: s_lshl_b32 s54, s42, 16 -; SI-NEXT: s_lshl_b32 s63, s45, 16 -; SI-NEXT: s_lshl_b32 s55, s44, 16 -; SI-NEXT: s_lshl_b32 s72, s47, 16 -; SI-NEXT: s_lshl_b32 s64, s46, 16 -; SI-NEXT: s_lshl_b32 s82, s57, 16 -; SI-NEXT: s_lshl_b32 s65, s56, 16 -; SI-NEXT: s_lshl_b32 s74, s59, 16 -; SI-NEXT: s_lshl_b32 s66, s58, 16 -; SI-NEXT: s_lshl_b32 s75, s87, 16 -; SI-NEXT: s_mov_b32 s73, s6 -; SI-NEXT: s_lshl_b32 s67, s6, 16 -; SI-NEXT: s_lshl_b32 s76, s83, 16 -; SI-NEXT: s_mov_b32 s16, s68 -; SI-NEXT: s_lshl_b32 s68, s68, 16 -; SI-NEXT: s_lshl_b32 s85, s84, 16 -; SI-NEXT: s_mov_b32 s98, s69 -; SI-NEXT: s_lshl_b32 s69, s69, 16 -; SI-NEXT: s_lshl_b32 s17, s80, 16 -; SI-NEXT: s_mov_b32 s6, s62 -; SI-NEXT: s_lshl_b32 s26, s62, 16 +; SI-NEXT: s_lshl_b32 s19, s21, 16 +; SI-NEXT: s_lshl_b32 s38, s49, 16 +; SI-NEXT: s_lshl_b32 s20, s23, 16 +; SI-NEXT: s_lshl_b32 s39, s50, 16 +; SI-NEXT: s_lshl_b32 s22, s25, 16 +; SI-NEXT: s_lshl_b32 s48, s75, 16 +; SI-NEXT: s_lshl_b32 s60, s27, 16 +; SI-NEXT: s_lshl_b32 s49, s77, 16 +; SI-NEXT: s_lshl_b32 s24, s29, 16 +; SI-NEXT: s_lshl_b32 s50, s88, 16 +; SI-NEXT: s_lshl_b32 s61, s7, 16 +; SI-NEXT: s_lshl_b32 s51, s89, 16 +; SI-NEXT: s_lshl_b32 s28, s9, 16 +; SI-NEXT: s_lshl_b32 s52, s8, 16 +; SI-NEXT: s_lshl_b32 s72, s11, 16 +; SI-NEXT: s_lshl_b32 s53, s10, 16 +; SI-NEXT: s_lshl_b32 s74, s13, 16 +; SI-NEXT: s_lshl_b32 s54, s12, 16 +; SI-NEXT: s_lshl_b32 s95, s15, 16 +; SI-NEXT: s_lshl_b32 s55, s14, 16 +; SI-NEXT: s_lshl_b32 s81, s41, 16 +; SI-NEXT: s_lshl_b32 s64, s40, 16 +; SI-NEXT: s_lshl_b32 s82, s43, 16 +; SI-NEXT: s_lshl_b32 s65, s42, 16 +; SI-NEXT: 
s_lshl_b32 s85, s45, 16 +; SI-NEXT: s_lshl_b32 s66, s44, 16 +; SI-NEXT: s_lshl_b32 s86, s47, 16 +; SI-NEXT: s_lshl_b32 s67, s46, 16 +; SI-NEXT: s_lshl_b32 s76, s57, 16 +; SI-NEXT: s_lshl_b32 s68, s56, 16 +; SI-NEXT: s_lshl_b32 s97, s59, 16 +; SI-NEXT: s_lshl_b32 s69, s58, 16 +; SI-NEXT: s_lshl_b32 s78, s87, 16 +; SI-NEXT: s_mov_b32 s6, s91 +; SI-NEXT: s_lshl_b32 s70, s91, 16 +; SI-NEXT: s_lshl_b32 s79, s83, 16 +; SI-NEXT: s_mov_b32 s73, s94 +; SI-NEXT: s_lshl_b32 s71, s94, 16 +; SI-NEXT: s_lshl_b32 s26, s84, 16 +; SI-NEXT: s_mov_b32 s91, s90 +; SI-NEXT: s_lshl_b32 s90, s90, 16 +; SI-NEXT: s_mov_b32 s94, s93 +; SI-NEXT: s_lshl_b32 s17, s93, 16 +; SI-NEXT: s_mov_b32 s93, s92 +; SI-NEXT: s_lshl_b32 s80, s92, 16 +; SI-NEXT: s_mov_b32 s92, s4 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB107_3 ; SI-NEXT: .LBB107_2: ; SI-NEXT: ; implicit-def: $sgpr17 ; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: s_mov_b32 s16, s68 +; SI-NEXT: s_mov_b32 s73, s94 ; SI-NEXT: ; implicit-def: $sgpr17 ; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: s_mov_b32 s73, s6 +; SI-NEXT: s_mov_b32 s6, s91 ; SI-NEXT: ; implicit-def: $sgpr17 ; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: s_mov_b32 s6, s62 +; SI-NEXT: s_mov_b32 s94, s93 ; SI-NEXT: ; implicit-def: $sgpr17 ; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: s_mov_b32 s98, s69 +; SI-NEXT: s_mov_b32 s93, s92 +; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; kill: killed $sgpr17 +; SI-NEXT: s_mov_b32 s91, s90 ; SI-NEXT: ; implicit-def: $sgpr17 ; SI-NEXT: ; kill: killed $sgpr17 ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $sgpr17 ; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $sgpr96 -; SI-NEXT: ; implicit-def: $sgpr99 -; SI-NEXT: ; implicit-def: $sgpr97 +; SI-NEXT: ; implicit-def: $sgpr16 ; SI-NEXT: ; implicit-def: $sgpr92 -; SI-NEXT: ; implicit-def: $sgpr94 -; SI-NEXT: ; implicit-def: $sgpr95 -; SI-NEXT: ; implicit-def: $sgpr93 -; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; 
kill: killed $sgpr16 +; SI-NEXT: ; implicit-def: $sgpr16 +; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; kill: killed $sgpr17 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr98 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr96 ; SI-NEXT: ; implicit-def: $sgpr31 ; SI-NEXT: ; implicit-def: $sgpr34 ; SI-NEXT: ; implicit-def: $sgpr35 -; SI-NEXT: ; implicit-def: $sgpr86 +; SI-NEXT: ; implicit-def: $sgpr30 ; SI-NEXT: ; implicit-def: $sgpr36 -; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr99 ; SI-NEXT: ; implicit-def: $sgpr37 -; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr19 ; SI-NEXT: ; implicit-def: $sgpr38 -; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr20 ; SI-NEXT: ; implicit-def: $sgpr39 -; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr22 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr89 -; SI-NEXT: ; implicit-def: $sgpr49 ; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr24 ; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; implicit-def: $sgpr90 -; SI-NEXT: ; implicit-def: $sgpr91 -; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr61 ; SI-NEXT: ; implicit-def: $sgpr51 -; SI-NEXT: ; implicit-def: $sgpr71 +; SI-NEXT: ; implicit-def: $sgpr28 ; SI-NEXT: ; implicit-def: $sgpr52 -; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr72 ; SI-NEXT: ; implicit-def: $sgpr53 -; SI-NEXT: ; implicit-def: $sgpr81 +; SI-NEXT: ; implicit-def: $sgpr74 ; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr95 ; SI-NEXT: ; implicit-def: $sgpr55 -; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr81 ; SI-NEXT: ; implicit-def: $sgpr64 ; SI-NEXT: ; implicit-def: $sgpr82 ; SI-NEXT: ; implicit-def: $sgpr65 -; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr85 ; SI-NEXT: ; 
implicit-def: $sgpr66 -; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr86 ; SI-NEXT: ; implicit-def: $sgpr67 ; SI-NEXT: ; implicit-def: $sgpr76 ; SI-NEXT: ; implicit-def: $sgpr68 -; SI-NEXT: ; implicit-def: $sgpr85 +; SI-NEXT: ; implicit-def: $sgpr97 ; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr71 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr80 ; SI-NEXT: ; implicit-def: $sgpr17 ; SI-NEXT: ; kill: killed $sgpr17 ; SI-NEXT: ; implicit-def: $sgpr17 @@ -231928,86 +231945,97 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: ; implicit-def: $sgpr17 ; SI-NEXT: .LBB107_3: ; %Flow ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: s_mov_b32 s5, s17 -; SI-NEXT: s_mov_b32 s17, s86 -; SI-NEXT: s_mov_b32 s86, s7 +; SI-NEXT: s_mov_b32 s4, s17 +; SI-NEXT: s_mov_b32 s17, s30 +; SI-NEXT: s_mov_b32 s30, s99 +; SI-NEXT: s_mov_b32 s99, s16 ; SI-NEXT: s_cbranch_vccnz .LBB107_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_add_i32 s4, s94, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s93, 16 +; SI-NEXT: s_add_i32 s84, s84, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s84, 0xffff +; SI-NEXT: s_lshl_b32 s60, s91, 16 +; SI-NEXT: s_add_i32 s83, s83, 3 +; SI-NEXT: s_or_b32 s5, s60, s5 +; SI-NEXT: s_and_b32 s60, s83, 0xffff +; SI-NEXT: s_lshl_b32 s61, s73, 16 +; SI-NEXT: s_or_b32 s79, s61, s60 +; SI-NEXT: s_lshl_b32 s61, s6, 16 +; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: v_readlane_b32 s6, 
v41, 24 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_lshl_b32 s20, s6, 16 ; SI-NEXT: v_readlane_b32 s6, v41, 23 -; SI-NEXT: s_lshl_b32 s17, s6, 16 -; SI-NEXT: v_readlane_b32 s6, v41, 22 -; SI-NEXT: s_lshl_b32 s61, s16, 16 -; SI-NEXT: s_add_i32 s16, s6, 3 -; SI-NEXT: v_readlane_b32 s6, v41, 21 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s7, s6, 16 -; SI-NEXT: v_readlane_b32 s6, v41, 20 -; SI-NEXT: s_or_b32 s7, s7, s16 -; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: v_readlane_b32 s16, v41, 19 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_and_b32 s19, s19, 0xffff -; SI-NEXT: s_or_b32 s6, s16, s6 -; SI-NEXT: v_readlane_b32 s16, v41, 18 -; SI-NEXT: s_lshl_b32 s60, s98, 16 -; SI-NEXT: s_or_b32 s17, s17, s19 -; SI-NEXT: s_add_i32 s98, s16, 3 -; SI-NEXT: v_readlane_b32 s19, v41, 17 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_and_b32 s16, s98, 0xffff -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_and_b32 s21, s21, 0xffff -; SI-NEXT: s_or_b32 s16, s19, s16 -; SI-NEXT: v_readlane_b32 s19, v41, 16 ; SI-NEXT: s_add_i32 s13, s13, 3 ; SI-NEXT: s_and_b32 s11, s11, 0xffff ; SI-NEXT: s_lshl_b32 s10, s10, 16 -; SI-NEXT: s_and_b32 s9, s9, 0xffff -; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s9, s89, 16 ; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_or_b32 s20, s20, s21 -; SI-NEXT: s_add_i32 s96, s19, 3 -; SI-NEXT: v_readlane_b32 s21, v41, 15 +; SI-NEXT: s_lshl_b32 s19, s6, 16 +; SI-NEXT: v_readlane_b32 s6, v41, 22 ; SI-NEXT: s_add_i32 s15, s15, 3 ; SI-NEXT: s_and_b32 s13, s13, 0xffff ; SI-NEXT: s_lshl_b32 s12, s12, 16 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_or_b32 s7, s9, s7 ; SI-NEXT: s_and_b32 s9, s29, 
0xffff ; SI-NEXT: s_lshl_b32 s11, s88, 16 ; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_and_b32 s19, s96, 0xffff -; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: s_add_i32 s16, s6, 3 +; SI-NEXT: v_readlane_b32 s6, v41, 21 ; SI-NEXT: s_and_b32 s15, s15, 0xffff ; SI-NEXT: s_lshl_b32 s14, s14, 16 ; SI-NEXT: s_or_b32 s12, s12, s13 ; SI-NEXT: s_or_b32 s9, s11, s9 ; SI-NEXT: s_and_b32 s11, s27, 0xffff -; SI-NEXT: s_lshl_b32 s13, s79, 16 +; SI-NEXT: s_lshl_b32 s13, s77, 16 ; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_or_b32 s19, s21, s19 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_readlane_b32 s21, v41, 14 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s17, s6, 16 +; SI-NEXT: v_readlane_b32 s6, v41, 20 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: s_or_b32 s11, s13, s11 ; SI-NEXT: s_and_b32 s13, s25, 0xffff -; SI-NEXT: s_lshl_b32 s15, s78, 16 +; SI-NEXT: s_lshl_b32 s15, s75, 16 ; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_and_b32 s18, s18, 0xffff -; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: v_readlane_b32 s17, v41, 19 ; SI-NEXT: s_or_b32 s13, s15, s13 ; SI-NEXT: s_and_b32 s15, s23, 0xffff -; SI-NEXT: s_lshl_b32 s22, s77, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_or_b32 s15, s20, s15 +; SI-NEXT: s_and_b32 s20, s21, 0xffff +; SI-NEXT: s_or_b32 s6, s17, s6 +; SI-NEXT: v_readlane_b32 s17, v41, 18 +; SI-NEXT: s_or_b32 s19, s19, s20 +; SI-NEXT: s_add_i32 s98, s17, 3 +; SI-NEXT: v_readlane_b32 s20, v41, 17 +; SI-NEXT: s_and_b32 s17, s98, 0xffff +; SI-NEXT: s_lshl_b32 s20, s20, 16 +; SI-NEXT: s_or_b32 s17, s20, s17 +; SI-NEXT: v_readlane_b32 s20, v41, 16 +; SI-NEXT: s_add_i32 s96, s20, 3 +; SI-NEXT: v_readlane_b32 s21, v41, 15 +; SI-NEXT: s_and_b32 s20, s96, 0xffff +; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: s_or_b32 s20, s21, s20 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_readlane_b32 s21, v41, 14 
+; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s21, s21, 16 ; SI-NEXT: s_or_b32 s18, s21, s18 ; SI-NEXT: v_readlane_b32 s21, v41, 13 -; SI-NEXT: s_or_b32 s15, s22, s15 ; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: v_readlane_b32 s22, v41, 12 ; SI-NEXT: s_and_b32 s21, s21, 0xffff @@ -232051,40 +232079,27 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: s_or_b32 s27, s28, s27 ; SI-NEXT: s_add_i32 s27, s27, 0x30000 ; SI-NEXT: s_add_i32 s26, s26, 0x30000 -; SI-NEXT: s_and_b32 s86, s27, 0xffff0000 -; SI-NEXT: s_lshl_b32 s27, s27, 16 +; SI-NEXT: s_and_b32 s28, s27, 0xffff0000 ; SI-NEXT: s_add_i32 s25, s25, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s27, 25 -; SI-NEXT: s_and_b32 s96, s26, 0xffff0000 +; SI-NEXT: v_writelane_b32 v41, s28, 25 +; SI-NEXT: s_and_b32 s99, s26, 0xffff0000 ; SI-NEXT: s_lshl_b32 s26, s26, 16 ; SI-NEXT: s_add_i32 s24, s24, 0x30000 ; SI-NEXT: v_writelane_b32 v41, s26, 26 -; SI-NEXT: s_and_b32 s99, s25, 0xffff0000 +; SI-NEXT: s_and_b32 s63, s25, 0xffff0000 ; SI-NEXT: s_lshl_b32 s25, s25, 16 ; SI-NEXT: s_add_i32 s23, s23, 0x30000 ; SI-NEXT: v_writelane_b32 v41, s25, 27 -; SI-NEXT: s_and_b32 s97, s24, 0xffff0000 +; SI-NEXT: s_and_b32 s98, s24, 0xffff0000 ; SI-NEXT: s_lshl_b32 s24, s24, 16 -; SI-NEXT: s_add_i32 s80, s80, 3 ; SI-NEXT: s_add_i32 s22, s22, 0x30000 ; SI-NEXT: v_writelane_b32 v41, s24, 28 -; SI-NEXT: s_and_b32 s92, s23, 0xffff0000 +; SI-NEXT: s_and_b32 s62, s23, 0xffff0000 ; SI-NEXT: s_lshl_b32 s23, s23, 16 -; SI-NEXT: s_and_b32 s4, s80, 0xffff -; SI-NEXT: s_add_i32 s84, s84, 3 ; SI-NEXT: s_add_i32 s21, s21, 0x30000 ; SI-NEXT: v_writelane_b32 v41, s23, 29 -; SI-NEXT: s_and_b32 s94, s22, 0xffff0000 +; SI-NEXT: s_and_b32 s96, s22, 0xffff0000 ; SI-NEXT: s_lshl_b32 s22, s22, 16 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s84, 0xffff -; SI-NEXT: s_add_i32 s83, s83, 3 -; SI-NEXT: s_add_i32 s18, s18, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s22, 30 -; SI-NEXT: s_and_b32 s95, 
s21, 0xffff0000 -; SI-NEXT: s_lshl_b32 s21, s21, 16 -; SI-NEXT: s_or_b32 s5, s60, s5 -; SI-NEXT: s_and_b32 s60, s83, 0xffff ; SI-NEXT: s_add_i32 s87, s87, 3 ; SI-NEXT: s_add_i32 s59, s59, 3 ; SI-NEXT: s_add_i32 s57, s57, 3 @@ -232092,13 +232107,11 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: s_add_i32 s45, s45, 3 ; SI-NEXT: s_add_i32 s43, s43, 3 ; SI-NEXT: s_add_i32 s41, s41, 3 -; SI-NEXT: s_add_i32 s19, s19, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s21, 31 -; SI-NEXT: s_and_b32 s93, s18, 0xffff0000 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_or_b32 s76, s61, s60 +; SI-NEXT: s_add_i32 s18, s18, 0x30000 +; SI-NEXT: v_writelane_b32 v41, s22, 30 +; SI-NEXT: s_and_b32 s31, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s21, s21, 16 ; SI-NEXT: s_and_b32 s60, s87, 0xffff -; SI-NEXT: s_lshl_b32 s61, s73, 16 ; SI-NEXT: s_and_b32 s59, s59, 0xffff ; SI-NEXT: s_lshl_b32 s58, s58, 16 ; SI-NEXT: s_and_b32 s57, s57, 0xffff @@ -232111,10 +232124,11 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: s_lshl_b32 s42, s42, 16 ; SI-NEXT: s_and_b32 s41, s41, 0xffff ; SI-NEXT: s_lshl_b32 s40, s40, 16 -; SI-NEXT: s_add_i32 s16, s16, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s18, 32 -; SI-NEXT: s_lshl_b32 s18, s19, 16 -; SI-NEXT: s_or_b32 s75, s61, s60 +; SI-NEXT: s_add_i32 s20, s20, 0x30000 +; SI-NEXT: v_writelane_b32 v41, s21, 31 +; SI-NEXT: s_and_b32 s34, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_or_b32 s78, s61, s60 ; SI-NEXT: s_or_b32 s58, s58, s59 ; SI-NEXT: s_or_b32 s56, s56, s57 ; SI-NEXT: s_or_b32 s46, s46, s47 @@ -232122,13 +232136,12 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: s_or_b32 s42, s42, s43 ; SI-NEXT: s_or_b32 s40, s40, s41 ; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s18, 33 -; SI-NEXT: s_and_b32 s31, s16, 0xffff0000 -; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: v_writelane_b32 
v41, s18, 32 +; SI-NEXT: s_lshl_b32 s18, s20, 16 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s76, s76, 0x30000 -; SI-NEXT: s_add_i32 s75, s75, 0x30000 +; SI-NEXT: s_add_i32 s79, s79, 0x30000 +; SI-NEXT: s_add_i32 s78, s78, 0x30000 ; SI-NEXT: s_add_i32 s58, s58, 0x30000 ; SI-NEXT: s_add_i32 s56, s56, 0x30000 ; SI-NEXT: s_add_i32 s46, s46, 0x30000 @@ -232139,293 +232152,296 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: s_add_i32 s12, s12, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 ; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s11, s11, 0x30000 ; SI-NEXT: s_add_i32 s13, s13, 0x30000 ; SI-NEXT: s_add_i32 s15, s15, 0x30000 -; SI-NEXT: s_add_i32 s20, s20, 0x30000 +; SI-NEXT: s_add_i32 s19, s19, 0x30000 +; SI-NEXT: s_add_i32 s16, s16, 0x30000 ; SI-NEXT: s_add_i32 s17, s17, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s16, 34 -; SI-NEXT: s_and_b32 s34, s6, 0xffff0000 +; SI-NEXT: v_writelane_b32 v41, s18, 33 +; SI-NEXT: s_and_b32 s30, s6, 0xffff0000 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_and_b32 s30, s19, 0xffff0000 -; SI-NEXT: v_writelane_b32 v41, s6, 35 -; SI-NEXT: s_and_b32 s35, s7, 0xffff0000 -; SI-NEXT: s_lshl_b32 s6, s7, 16 +; SI-NEXT: s_lshl_b32 s92, s27, 16 +; SI-NEXT: s_and_b32 s35, s20, 0xffff0000 ; SI-NEXT: s_and_b32 s36, s17, 0xffff0000 ; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_and_b32 s37, s20, 0xffff0000 -; SI-NEXT: s_lshl_b32 s22, s20, 16 -; SI-NEXT: s_and_b32 s38, s15, 0xffff0000 -; SI-NEXT: s_lshl_b32 s24, s15, 16 -; SI-NEXT: s_and_b32 s39, s13, 0xffff0000 -; SI-NEXT: s_lshl_b32 s28, s13, 16 -; SI-NEXT: s_and_b32 s48, s11, 0xffff0000 -; SI-NEXT: s_lshl_b32 s61, s11, 16 -; SI-NEXT: s_and_b32 s49, s9, 0xffff0000 -; SI-NEXT: s_lshl_b32 s89, s9, 16 -; SI-NEXT: s_and_b32 s50, s8, 0xffff0000 -; SI-NEXT: s_lshl_b32 s60, s8, 16 
-; SI-NEXT: s_and_b32 s91, s10, 0xffff0000 -; SI-NEXT: s_lshl_b32 s90, s10, 16 -; SI-NEXT: s_and_b32 s51, s12, 0xffff0000 -; SI-NEXT: s_lshl_b32 s70, s12, 16 -; SI-NEXT: s_and_b32 s52, s14, 0xffff0000 -; SI-NEXT: s_lshl_b32 s71, s14, 16 -; SI-NEXT: s_and_b32 s53, s40, 0xffff0000 -; SI-NEXT: s_lshl_b32 s20, s40, 16 -; SI-NEXT: s_and_b32 s54, s42, 0xffff0000 -; SI-NEXT: s_lshl_b32 s81, s42, 16 -; SI-NEXT: s_and_b32 s55, s44, 0xffff0000 -; SI-NEXT: s_lshl_b32 s63, s44, 16 -; SI-NEXT: s_and_b32 s64, s46, 0xffff0000 -; SI-NEXT: s_lshl_b32 s72, s46, 16 -; SI-NEXT: s_and_b32 s65, s56, 0xffff0000 -; SI-NEXT: s_lshl_b32 s82, s56, 16 -; SI-NEXT: s_and_b32 s66, s58, 0xffff0000 -; SI-NEXT: s_lshl_b32 s74, s58, 16 -; SI-NEXT: s_and_b32 s67, s75, 0xffff0000 -; SI-NEXT: s_lshl_b32 s75, s75, 16 -; SI-NEXT: s_and_b32 s68, s76, 0xffff0000 -; SI-NEXT: s_lshl_b32 s76, s76, 16 -; SI-NEXT: s_and_b32 s69, s5, 0xffff0000 -; SI-NEXT: s_lshl_b32 s85, s5, 16 -; SI-NEXT: s_and_b32 s26, s4, 0xffff0000 -; SI-NEXT: s_lshl_b32 s5, s4, 16 -; SI-NEXT: v_writelane_b32 v41, s6, 36 +; SI-NEXT: v_writelane_b32 v41, s6, 34 +; SI-NEXT: s_and_b32 s37, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s16, 16 +; SI-NEXT: s_and_b32 s38, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_and_b32 s39, s15, 0xffff0000 +; SI-NEXT: s_lshl_b32 s20, s15, 16 +; SI-NEXT: s_and_b32 s48, s13, 0xffff0000 +; SI-NEXT: s_lshl_b32 s22, s13, 16 +; SI-NEXT: s_and_b32 s49, s11, 0xffff0000 +; SI-NEXT: s_lshl_b32 s60, s11, 16 +; SI-NEXT: s_and_b32 s50, s9, 0xffff0000 +; SI-NEXT: s_lshl_b32 s24, s9, 16 +; SI-NEXT: s_and_b32 s51, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s61, s7, 16 +; SI-NEXT: s_and_b32 s52, s8, 0xffff0000 +; SI-NEXT: s_lshl_b32 s28, s8, 16 +; SI-NEXT: s_and_b32 s53, s10, 0xffff0000 +; SI-NEXT: s_lshl_b32 s72, s10, 16 +; SI-NEXT: s_and_b32 s54, s12, 0xffff0000 +; SI-NEXT: s_lshl_b32 s74, s12, 16 +; SI-NEXT: s_and_b32 s55, s14, 0xffff0000 +; SI-NEXT: s_lshl_b32 s95, s14, 16 +; SI-NEXT: s_and_b32 s64, s40, 
0xffff0000 +; SI-NEXT: s_lshl_b32 s81, s40, 16 +; SI-NEXT: s_and_b32 s65, s42, 0xffff0000 +; SI-NEXT: s_lshl_b32 s82, s42, 16 +; SI-NEXT: s_and_b32 s66, s44, 0xffff0000 +; SI-NEXT: s_lshl_b32 s85, s44, 16 +; SI-NEXT: s_and_b32 s67, s46, 0xffff0000 +; SI-NEXT: s_lshl_b32 s86, s46, 16 +; SI-NEXT: s_and_b32 s68, s56, 0xffff0000 +; SI-NEXT: s_lshl_b32 s76, s56, 16 +; SI-NEXT: s_and_b32 s69, s58, 0xffff0000 +; SI-NEXT: s_lshl_b32 s97, s58, 16 +; SI-NEXT: s_and_b32 s70, s78, 0xffff0000 +; SI-NEXT: s_lshl_b32 s78, s78, 16 +; SI-NEXT: s_and_b32 s71, s79, 0xffff0000 +; SI-NEXT: s_lshl_b32 s79, s79, 16 +; SI-NEXT: s_and_b32 s90, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s26, s5, 16 +; SI-NEXT: s_and_b32 s80, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_writelane_b32 v41, s6, 35 ; SI-NEXT: .LBB107_5: ; %end -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s86 -; SI-NEXT: v_readlane_b32 s4, v41, 25 +; SI-NEXT: v_readlane_b32 s5, v41, 25 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s5 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s92 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s96 -; SI-NEXT: v_readlane_b32 s4, v41, 26 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s99 +; SI-NEXT: v_readlane_b32 s5, v41, 26 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s5 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s99 -; SI-NEXT: v_readlane_b32 s4, v41, 27 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s63 +; SI-NEXT: v_readlane_b32 s5, v41, 27 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s5 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, 
vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s97 -; SI-NEXT: v_readlane_b32 s4, v41, 28 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s98 +; SI-NEXT: v_readlane_b32 s5, v41, 28 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s5 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s92 -; SI-NEXT: v_readlane_b32 s4, v41, 29 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s62 +; SI-NEXT: v_readlane_b32 s5, v41, 29 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s5 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s94 -; SI-NEXT: v_readlane_b32 s4, v41, 30 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s96 +; SI-NEXT: v_readlane_b32 s5, v41, 30 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s5 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95 -; SI-NEXT: v_readlane_b32 s4, v41, 31 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s31 +; SI-NEXT: v_readlane_b32 s5, v41, 31 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s5 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s93 -; SI-NEXT: v_readlane_b32 s4, v41, 32 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s34 +; SI-NEXT: v_readlane_b32 s5, 
v41, 32 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s5 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s30 -; SI-NEXT: v_readlane_b32 s4, v41, 33 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35 +; SI-NEXT: v_readlane_b32 s5, v41, 33 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s5 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s31 -; SI-NEXT: v_readlane_b32 s4, v41, 34 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s36 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s17 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s34 -; SI-NEXT: v_readlane_b32 s4, v41, 35 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s30 +; SI-NEXT: v_readlane_b32 s5, v41, 34 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s5 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35 -; SI-NEXT: v_readlane_b32 s4, v41, 36 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s37 +; SI-NEXT: v_readlane_b32 s5, v41, 35 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s5 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt 
expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s36 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s38 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s37 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s39 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s20 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s38 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s48 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s22 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s39 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s60 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s48 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s50 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s61 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s24 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s51 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s89 +; SI-NEXT: 
v_mul_f32_e64 v2, 1.0, s61 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s50 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s52 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s60 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s28 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s91 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s53 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s90 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s72 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s51 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s54 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s70 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s74 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s52 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s55 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s71 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s95 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s53 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s64 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s81 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; 
SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s54 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s81 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s82 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s55 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s63 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s85 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s64 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s67 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s72 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s86 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s68 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s82 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s76 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s74 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s97 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s67 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s70 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 
v2, 1.0, s75 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s78 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s68 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s71 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s76 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s79 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s90 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s85 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s26 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s80 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s5 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen @@ -232930,94 +232946,94 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:68 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword 
v39, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:88 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v22 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v40, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v53, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte 
Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v14 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v12, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v50 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v29 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v6, v42 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v49 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v47, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v50 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v51 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v47, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v41 -; SI-NEXT: 
s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v58 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v56 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v59 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v49, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v60 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v50, v37 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v58, v32 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v33, v36 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v36, v37 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:100 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:104 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:120 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v62 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: 
v_cvt_f16_f32_e32 v45, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v39 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f16_f32_e32 v14, v6 ; SI-NEXT: s_waitcnt vmcnt(5) @@ -233026,15 +233042,15 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v41, v8 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f16_f32_e32 v30, v10 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v32 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v8, v37 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:124 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:128 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:132 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v32 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v46 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v46, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f16_f32_e32 v32, v6 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -233047,17 +233063,18 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_mov_b32_e32 v59, v29 ; SI-NEXT: v_mov_b32_e32 v29, v27 ; SI-NEXT: v_mov_b32_e32 v57, v23 -; SI-NEXT: v_mov_b32_e32 v60, v3 -; SI-NEXT: v_mov_b32_e32 v62, v4 -; SI-NEXT: v_mov_b32_e32 v63, v49 -; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v60, v4 +; SI-NEXT: v_mov_b32_e32 v62, v13 ; SI-NEXT: s_xor_b64 exec, exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB108_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v4, v61 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: 
v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 @@ -233087,17 +233104,20 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v14, v14, v37 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v36 ; SI-NEXT: v_or_b32_e32 v33, v33, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 @@ -233107,111 +233127,106 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v61 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v56, v37, v39 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; 
SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_or_b32_e32 v11, v11, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v4 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_or_b32_e32 v61, v3, v37 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v58 -; SI-NEXT: v_or_b32_e32 v11, v11, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v17 ; SI-NEXT: v_or_b32_e32 v16, v16, v37 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_or_b32_e32 v21, v21, v37 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_or_b32_e32 v24, v24, v37 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v28 +; 
SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v31, v31, v37 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v20 ; SI-NEXT: v_or_b32_e32 v19, v19, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1 ; SI-NEXT: v_or_b32_e32 v18, v18, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v37 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v53, v37 ; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 ; SI-NEXT: v_or_b32_e32 v52, v37, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v40, v37 ; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v40 -; SI-NEXT: 
v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_or_b32_e32 v55, v37, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v37 ; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_or_b32_e32 v43, v37, v39 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v37, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v39, v63 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_add_f32_e32 v47, 0x38000000, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 @@ -233219,27 +233234,22 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 ; SI-NEXT: 
v_cvt_f32_f16_e32 v45, v45 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 -; SI-NEXT: v_add_f32_e32 v47, 0x38000000, v47 ; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 +; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 ; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v37, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 ; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 ; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 ; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 ; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v45 @@ -233250,125 +233260,131 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 ; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 ; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; SI-NEXT: v_or_b32_e32 v38, v38, v47 ; SI-NEXT: v_or_b32_e32 v54, v54, v42 +; SI-NEXT: v_or_b32_e32 v49, v49, v51 ; SI-NEXT: v_or_b32_e32 v45, v45, v50 ; SI-NEXT: 
v_or_b32_e32 v41, v41, v30 ; SI-NEXT: v_or_b32_e32 v46, v46, v32 -; SI-NEXT: v_alignbit_b32 v47, v16, v47, 16 ; SI-NEXT: v_alignbit_b32 v42, v11, v42, 16 +; SI-NEXT: v_alignbit_b32 v51, v56, v51, 16 ; SI-NEXT: v_alignbit_b32 v50, v14, v50, 16 ; SI-NEXT: v_alignbit_b32 v30, v7, v30, 16 ; SI-NEXT: v_alignbit_b32 v32, v6, v32, 16 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v3 -; SI-NEXT: v_or_b32_e32 v3, v37, v34 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v48, v4 +; SI-NEXT: v_or_b32_e32 v4, v37, v34 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v39 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_alignbit_b32 v34, v43, v34, 16 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_alignbit_b32 v63, v55, v37, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v3, v39, v1 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v4, v48, v37 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v1, v55, v1, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v37, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v3 -; SI-NEXT: v_or_b32_e32 v3, v37, v5 -; SI-NEXT: buffer_store_dword 
v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v37, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_alignbit_b32 v5, v52, v5, 16 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v48, v4 +; SI-NEXT: v_or_b32_e32 v4, v39, v5 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v3, v39, v9 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v62 +; SI-NEXT: v_alignbit_b32 v5, v52, v5, 16 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v4 +; SI-NEXT: v_or_b32_e32 v4, v39, v9 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; SI-NEXT: v_or_b32_e32 v62, v56, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v60 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_alignbit_b32 v9, v2, v9, 16 -; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v62, v48, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v60 +; SI-NEXT: v_add_f32_e32 v48, 
0x38000000, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; SI-NEXT: v_or_b32_e32 v60, v56, v39 -; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 -; SI-NEXT: v_or_b32_e32 v57, v56, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v59 +; SI-NEXT: v_or_b32_e32 v60, v48, v39 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_or_b32_e32 v57, v48, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v59 ; SI-NEXT: v_alignbit_b32 v26, v31, v26, 16 -; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v23, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v29, v29, v23 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v27, v4 +; SI-NEXT: v_alignbit_b32 v4, v18, v13, 16 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v4, v19, v39, 16 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 
-; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_or_b32_e32 v59, v56, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v63 -; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 -; SI-NEXT: v_or_b32_e32 v63, v56, v35 -; SI-NEXT: v_alignbit_b32 v35, v33, v35, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v49, v3 -; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_or_b32_e32 v3, v49, v51 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v3, v19, v39, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v3, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v49, v18, v37, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v4, v24, v23, 16 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v59, v48, v27 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v58 +; SI-NEXT: v_or_b32_e32 v38, v38, v48 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v3, v21, v27, 16 -; SI-NEXT: v_alignbit_b32 v51, v61, v51, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v4, v21, v27, 16 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v47, 0x38000000, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 +; SI-NEXT: v_or_b32_e32 v58, v47, v35 +; SI-NEXT: v_alignbit_b32 v47, v16, v48, 16 +; SI-NEXT: v_alignbit_b32 v35, v33, v35, 16 ; SI-NEXT: .LBB108_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) 
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v37, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v37, 0xffff, v4 ; SI-NEXT: v_or_b32_e32 v34, v37, v34 ; SI-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -233377,58 +233393,52 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v34, v34, v37 ; SI-NEXT: v_add_i32_e32 v37, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v63 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; SI-NEXT: v_and_b32_e32 v34, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v1, v34, v1 -; SI-NEXT: v_add_i32_e32 v34, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v34, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v55 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v40 -; SI-NEXT: v_or_b32_e32 v1, v1, v34 -; SI-NEXT: v_add_i32_e32 v34, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v34, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v5, 
s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v34, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v34, v34, v37 +; SI-NEXT: v_add_i32_e32 v37, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v52 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v53 -; SI-NEXT: v_or_b32_e32 v1, v1, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v34, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v40 +; SI-NEXT: v_or_b32_e32 v34, v34, v37 +; SI-NEXT: v_add_i32_e32 v37, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_and_b32_e32 v34, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v5, v34, v5 +; SI-NEXT: v_add_i32_e32 v34, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v5, v34, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v53 +; SI-NEXT: v_or_b32_e32 v5, v5, v34 +; SI-NEXT: v_add_i32_e32 v34, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v5, v34, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v5, v5, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v62 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v5 -; SI-NEXT: v_add_i32_e32 
v5, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v62 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v60 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -233454,7 +233464,7 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -233468,7 +233478,7 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, 
vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v59 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -233502,28 +233512,24 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v49 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v61 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v63 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v58 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 @@ -233878,29 +233884,27 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 
v49, v5 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v36, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v24 @@ -233919,19 +233923,22 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v31, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v46 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v47 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt 
expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v56 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v58 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v59 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v61 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v63 ; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v63 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v6, v34 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v34, v37 ; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_cvt_f16_f32_e32 v21, v48 @@ -234111,9 +234118,9 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v45 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_or_b32_e32 v60, v48, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v62 ; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 @@ -234121,153 +234128,153 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v54, v56 ; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40 +; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 ; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 
; SI-NEXT: v_or_b32_e32 v56, v54, v48 ; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: v_or_b32_e32 v45, v40, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v7 -; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; SI-NEXT: v_or_b32_e32 v7, v41, v55 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; SI-NEXT: v_or_b32_e32 v41, v41, v55 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: 
v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_alignbit_b32 v63, v46, v51, 16 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v45 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v45, v40, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: 
v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v7, v7, v40 ; SI-NEXT: v_or_b32_e32 v14, v14, v13 ; SI-NEXT: v_or_b32_e32 v23, v23, v17 ; SI-NEXT: v_or_b32_e32 v34, v34, v21 ; SI-NEXT: v_alignbit_b32 v4, v57, v4, 16 -; SI-NEXT: v_alignbit_b32 v63, v46, v51, 16 ; SI-NEXT: v_alignbit_b32 v62, v29, v48, 16 ; SI-NEXT: v_alignbit_b32 v61, v52, v54, 16 ; SI-NEXT: v_alignbit_b32 v44, v49, v55, 16 +; SI-NEXT: v_alignbit_b32 v43, v38, v40, 16 ; SI-NEXT: v_alignbit_b32 v13, v32, v13, 16 ; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 ; SI-NEXT: v_alignbit_b32 v21, v2, v21, 16 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: v_or_b32_e32 v7, v7, v40 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v7, v41, v10 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v41, v41, v10 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: v_alignbit_b32 v10, v35, v10, 16 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v41, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v41, 
v41 ; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: v_or_b32_e32 v7, v41, v20 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v41, v41, v20 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: v_alignbit_b32 v20, v31, v20, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v41, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 ; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: v_or_b32_e32 v7, v41, v28 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v41, v41, v28 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_alignbit_b32 v28, v15, v28, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v41, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 ; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v7 -; SI-NEXT: v_or_b32_e32 v7, v41, v27 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v27, v11, v27, 16 +; SI-NEXT: v_or_b32_e32 v41, v41, v27 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v42 +; SI-NEXT: 
buffer_load_dword v42, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v43 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v7, v41, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v27, v11, v27, 16 +; SI-NEXT: v_or_b32_e32 v41, v41, v26 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v26, v8, v26, 16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 ; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 ; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 -; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: v_or_b32_e32 v43, v42, v24 -; SI-NEXT: v_alignbit_b32 v26, v8, v26, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v7, v41, v37 -; SI-NEXT: v_mov_b32_e32 v51, v7 -; SI-NEXT: v_alignbit_b32 v7, v38, v40, 16 +; SI-NEXT: v_or_b32_e32 v41, v42, v24 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, v1 ; SI-NEXT: v_alignbit_b32 v24, v5, v24, 16 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: v_or_b32_e32 v1, v41, v37 +; SI-NEXT: v_mov_b32_e32 v51, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_alignbit_b32 v37, v1, v37, 16 ; SI-NEXT: .LBB109_3: ; %end ; SI-NEXT: v_and_b32_e32 v48, 0xffff, v60 @@ -234319,7 +234326,6 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; 
SI-NEXT: v_lshlrev_b32_e32 v29, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 @@ -234337,9 +234343,8 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_add_i32_e32 v29, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v4, v29, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v43 ; SI-NEXT: v_or_b32_e32 v4, v4, v7 ; SI-NEXT: v_add_i32_e32 v7, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen @@ -234350,7 +234355,7 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_add_i32_e32 v7, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 @@ -234376,7 +234381,7 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_add_i32_e32 v7, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v20 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 @@ -234402,7 +234407,7 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x4c, v0 ; SI-NEXT: 
buffer_store_dword v4, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v28 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 @@ -234416,7 +234421,7 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v27 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 @@ -234430,7 +234435,7 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v26 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 @@ -234444,8 +234449,10 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v43 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_or_b32_e32 v4, v4, v7 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen @@ 
-236148,10 +236155,9 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 @@ -236171,13 +236177,13 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i ; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; SI-NEXT: v_mov_b32_e32 v42, v4 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, 
off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill @@ -236188,15 +236194,14 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v41 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec @@ -236205,35 +236210,35 
@@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v41, s16 ; SI-NEXT: v_cvt_f32_f16_e32 v44, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s27 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v41, s17 -; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s29 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v41, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s27 +; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v41, s19 -; SI-NEXT: v_mov_b32_e32 v2, v9 +; SI-NEXT: v_mov_b32_e32 v2, v5 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v41, s20 -; SI-NEXT: v_mov_b32_e32 v3, v10 +; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v6 @@ -236268,56 +236273,56 @@ define inreg <64 x half> 
@bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v41, s25 -; SI-NEXT: v_mov_b32_e32 v60, v29 +; SI-NEXT: v_mov_b32_e32 v62, v28 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v11 ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v41, s28 -; SI-NEXT: v_mov_b32_e32 v61, v30 +; SI-NEXT: v_mov_b32_e32 v63, v29 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v12 +; SI-NEXT: v_mov_b32_e32 v60, v30 +; SI-NEXT: v_mov_b32_e32 v61, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v62 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 +; SI-NEXT: v_mov_b32_e32 v32, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v46 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v53 +; SI-NEXT: v_mov_b32_e32 v31, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v59 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v40 +; SI-NEXT: 
v_cvt_f32_f16_e32 v10, v55 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v53 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v25 ; SI-NEXT: v_mov_b32_e32 v25, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v52 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v50 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v19 @@ -236328,11 +236333,16 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v35 +; SI-NEXT: v_mov_b32_e32 v26, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v48 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v35 ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f32_f16_e32 
v43, v33 ; SI-NEXT: s_branch .LBB111_3 @@ -236341,33 +236351,53 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i ; SI-NEXT: ; kill: killed $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; kill: killed $vgpr41 -; SI-NEXT: v_mov_b32_e32 v61, v30 +; SI-NEXT: v_mov_b32_e32 v61, v32 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; kill: killed $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; kill: killed $vgpr41 -; SI-NEXT: v_mov_b32_e32 v60, v29 +; SI-NEXT: v_mov_b32_e32 v32, v31 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; kill: killed $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; kill: killed $vgpr41 -; SI-NEXT: v_mov_b32_e32 v3, v10 +; SI-NEXT: v_mov_b32_e32 v31, v46 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; kill: killed $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; kill: killed $vgpr41 -; SI-NEXT: v_mov_b32_e32 v2, v9 +; SI-NEXT: v_mov_b32_e32 v60, v30 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; kill: killed $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; kill: killed $vgpr41 +; SI-NEXT: v_mov_b32_e32 v63, v29 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; kill: killed $vgpr41 +; SI-NEXT: v_mov_b32_e32 v62, v28 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; kill: killed $vgpr41 +; SI-NEXT: v_mov_b32_e32 v3, v7 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; kill: killed $vgpr41 +; SI-NEXT: v_mov_b32_e32 v2, v5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; kill: killed $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; kill: killed $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: 
$vgpr59 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr1 @@ -236404,22 +236434,6 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i ; SI-NEXT: ; kill: killed $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; kill: killed $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; kill: killed $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; kill: killed $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; kill: killed $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; kill: killed $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr4 @@ -236456,10 +236470,10 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v19, v1 ; SI-NEXT: s_cbranch_vccnz .LBB111_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v2 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v2 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_cvt_f32_f16_e32 v41, s16 ; SI-NEXT: s_add_i32 s17, s17, 3 @@ -236514,16 +236528,17 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i ; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 ; SI-NEXT: 
v_add_i32_e32 v55, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v40 -; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v62 -; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v63 -; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v61 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v60 -; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v60 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v63 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v62 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: s_add_i32 s29, s29, 3 ; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: s_add_i32 s27, s27, 3 ; SI-NEXT: s_add_i32 s26, s26, 3 @@ -236539,75 +236554,72 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v45, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v43, v33 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v4 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v4 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v1 -; SI-NEXT: buffer_load_dword 
v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v4 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v4 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v4 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v4 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v4 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 
v22, v22 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v4 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 
4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v3 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill @@ -236617,8 +236629,6 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v49 @@ -236634,7 +236644,6 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s27 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 @@ -236692,6 +236701,7 @@ define inreg <64 x half> 
@bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s27 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 @@ -236752,7 +236762,7 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll index 3e96ab1d597d6..787372e30e379 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll @@ -3740,7 +3740,7 @@ define inreg <16 x i8> @bitcast_v4i32_to_v16i8_scalar(<4 x i32> inreg %a, i32 in ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s16, 0 -; GFX11-NEXT: s_mov_b32 s18, 0 +; GFX11-NEXT: s_mov_b32 s17, 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB25_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_lshr_b32 s8, s3, 24 @@ -3752,10 +3752,10 @@ define inreg <16 x i8> @bitcast_v4i32_to_v16i8_scalar(<4 x i32> inreg %a, i32 in ; GFX11-NEXT: s_lshr_b32 s14, s1, 16 ; GFX11-NEXT: s_lshr_b32 s15, s1, 8 ; GFX11-NEXT: s_lshr_b32 s16, s0, 16 -; GFX11-NEXT: s_lshr_b32 s17, s0, 8 +; GFX11-NEXT: s_lshr_b32 s18, s0, 8 ; GFX11-NEXT: s_lshr_b64 s[4:5], s[2:3], 24 ; GFX11-NEXT: s_lshr_b64 s[6:7], s[0:1], 24 -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s18 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s17 ; GFX11-NEXT: s_cbranch_vccnz .LBB25_3 ; GFX11-NEXT: .LBB25_2: ; %cmp.true ; GFX11-NEXT: 
s_add_i32 s1, s1, 3 @@ -3773,10 +3773,10 @@ define inreg <16 x i8> @bitcast_v4i32_to_v16i8_scalar(<4 x i32> inreg %a, i32 in ; GFX11-NEXT: s_lshr_b32 s14, s1, 16 ; GFX11-NEXT: s_lshr_b32 s15, s1, 8 ; GFX11-NEXT: s_lshr_b32 s16, s0, 16 -; GFX11-NEXT: s_lshr_b32 s17, s0, 8 +; GFX11-NEXT: s_lshr_b32 s18, s0, 8 ; GFX11-NEXT: .LBB25_3: ; %end ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s17 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s18 ; GFX11-NEXT: v_dual_mov_b32 v2, s16 :: v_dual_mov_b32 v3, s6 ; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s15 ; GFX11-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s13 @@ -3786,7 +3786,7 @@ define inreg <16 x i8> @bitcast_v4i32_to_v16i8_scalar(<4 x i32> inreg %a, i32 in ; GFX11-NEXT: v_dual_mov_b32 v14, s9 :: v_dual_mov_b32 v15, s8 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB25_4: -; GFX11-NEXT: ; implicit-def: $sgpr17 +; GFX11-NEXT: ; implicit-def: $sgpr18 ; GFX11-NEXT: ; implicit-def: $sgpr16 ; GFX11-NEXT: ; implicit-def: $sgpr6 ; GFX11-NEXT: ; implicit-def: $sgpr15 @@ -12304,7 +12304,7 @@ define inreg <16 x i8> @bitcast_v2i64_to_v16i8_scalar(<2 x i64> inreg %a, i32 in ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s16, 0 -; GFX11-NEXT: s_mov_b32 s18, 0 +; GFX11-NEXT: s_mov_b32 s17, 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB69_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_lshr_b32 s8, s3, 24 @@ -12316,10 +12316,10 @@ define inreg <16 x i8> @bitcast_v2i64_to_v16i8_scalar(<2 x i64> inreg %a, i32 in ; GFX11-NEXT: s_lshr_b32 s14, s1, 16 ; GFX11-NEXT: s_lshr_b32 s15, s1, 8 ; GFX11-NEXT: s_lshr_b32 s16, s0, 16 -; GFX11-NEXT: s_lshr_b32 s17, s0, 8 +; GFX11-NEXT: s_lshr_b32 s18, s0, 8 ; GFX11-NEXT: s_lshr_b64 s[4:5], s[2:3], 24 ; GFX11-NEXT: s_lshr_b64 s[6:7], s[0:1], 24 -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s18 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s17 ; GFX11-NEXT: 
s_cbranch_vccnz .LBB69_3 ; GFX11-NEXT: .LBB69_2: ; %cmp.true ; GFX11-NEXT: s_add_u32 s0, s0, 3 @@ -12337,10 +12337,10 @@ define inreg <16 x i8> @bitcast_v2i64_to_v16i8_scalar(<2 x i64> inreg %a, i32 in ; GFX11-NEXT: s_lshr_b32 s14, s1, 16 ; GFX11-NEXT: s_lshr_b32 s15, s1, 8 ; GFX11-NEXT: s_lshr_b32 s16, s0, 16 -; GFX11-NEXT: s_lshr_b32 s17, s0, 8 +; GFX11-NEXT: s_lshr_b32 s18, s0, 8 ; GFX11-NEXT: .LBB69_3: ; %end ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s17 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s18 ; GFX11-NEXT: v_dual_mov_b32 v2, s16 :: v_dual_mov_b32 v3, s6 ; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s15 ; GFX11-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s13 @@ -12350,7 +12350,7 @@ define inreg <16 x i8> @bitcast_v2i64_to_v16i8_scalar(<2 x i64> inreg %a, i32 in ; GFX11-NEXT: v_dual_mov_b32 v14, s9 :: v_dual_mov_b32 v15, s8 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB69_4: -; GFX11-NEXT: ; implicit-def: $sgpr17 +; GFX11-NEXT: ; implicit-def: $sgpr18 ; GFX11-NEXT: ; implicit-def: $sgpr16 ; GFX11-NEXT: ; implicit-def: $sgpr6 ; GFX11-NEXT: ; implicit-def: $sgpr15 @@ -18872,78 +18872,74 @@ define <16 x i8> @bitcast_v8i16_to_v16i8(<8 x i16> %a, i32 %b) { ; VI-LABEL: bitcast_v8i16_to_v16i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v19, v3 -; VI-NEXT: v_mov_b32_e32 v18, v2 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v19, v1 +; VI-NEXT: v_mov_b32_e32 v18, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NEXT: ; implicit-def: $vgpr16 -; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v19 
+; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; VI-NEXT: ; implicit-def: $vgpr1 ; VI-NEXT: ; implicit-def: $vgpr3 -; VI-NEXT: ; implicit-def: $vgpr17 ; VI-NEXT: ; implicit-def: $vgpr5 ; VI-NEXT: ; implicit-def: $vgpr7 -; VI-NEXT: ; implicit-def: $vgpr8 ; VI-NEXT: ; implicit-def: $vgpr9 -; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: ; implicit-def: $vgpr13 ; VI-NEXT: ; implicit-def: $vgpr15 -; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB96_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v19 -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v19 -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v18 -; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v0 -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[18:19] -; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; VI-NEXT: v_mov_b32_e32 v16, v0 -; VI-NEXT: v_mov_b32_e32 v17, v1 -; VI-NEXT: v_mov_b32_e32 v8, v18 -; VI-NEXT: v_mov_b32_e32 v21, v19 -; VI-NEXT: ; implicit-def: $vgpr1 -; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 ; VI-NEXT: .LBB96_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB96_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v3, 3 -; VI-NEXT: v_add_u16_sdwa v6, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v2, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v14, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v10, v18, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v17, 3, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; VI-NEXT: v_add_u16_e32 v16, 3, v0 +; VI-NEXT: v_add_u16_sdwa v6, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v20, 3, v19 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; VI-NEXT: v_add_u16_sdwa v2, v18, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v14, v17, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v10, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v20, v0 +; VI-NEXT: v_add_u16_e32 v19, 3, v18 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; VI-NEXT: v_add_u16_e32 v21, 3, v19 +; VI-NEXT: v_add_u16_e32 v21, 3, v17 ; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v14 -; VI-NEXT: v_add_u16_e32 v8, 3, v18 +; VI-NEXT: v_add_u16_e32 v17, 3, v16 ; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 -; VI-NEXT: v_or_b32_e32 v1, v17, v1 -; VI-NEXT: v_or_b32_e32 v0, v16, v0 -; VI-NEXT: v_or_b32_e32 v19, v21, v4 -; VI-NEXT: v_or_b32_e32 v18, v8, v3 -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[18:19] +; VI-NEXT: v_or_b32_e32 v0, v19, v0 +; VI-NEXT: v_or_b32_e32 v8, v21, v4 +; VI-NEXT: v_or_b32_e32 v7, v17, v3 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[7:8] ; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v19 -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v18 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v7 ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v0 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; VI-NEXT: v_bfe_u32 v15, v14, 8, 8 ; VI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; VI-NEXT: v_mov_b32_e32 v18, v19 +; VI-NEXT: v_mov_b32_e32 v19, v20 +; VI-NEXT: v_mov_b32_e32 v16, v17 +; VI-NEXT: v_mov_b32_e32 v17, v21 ; VI-NEXT: 
.LBB96_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_mov_b32_e32 v0, v16 -; VI-NEXT: v_mov_b32_e32 v1, v20 -; VI-NEXT: v_mov_b32_e32 v4, v17 -; VI-NEXT: v_mov_b32_e32 v12, v21 +; VI-NEXT: v_mov_b32_e32 v0, v18 +; VI-NEXT: v_mov_b32_e32 v4, v19 +; VI-NEXT: v_mov_b32_e32 v8, v16 +; VI-NEXT: v_mov_b32_e32 v12, v17 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v8i16_to_v16i8: @@ -23829,13 +23825,13 @@ define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v3 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v3 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[2:3] -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v11.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v10.l +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[2:3] ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v11.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l ; GFX11-TRUE16-NEXT: .LBB108_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB108_4 @@ -23923,9 +23919,9 @@ define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v10 ; GFX11-TRUE16-NEXT: .LBB108_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v17.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h @@ -25212,9 
+25208,9 @@ define inreg <8 x bfloat> @bitcast_v16i8_to_v8bf16_scalar(<16 x i8> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_readfirstlane_b32 s9, v1 +; SI-NEXT: v_readfirstlane_b32 s8, v1 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s11, v0 +; SI-NEXT: v_readfirstlane_b32 s9, v0 ; SI-NEXT: s_cbranch_scc0 .LBB111_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xff @@ -25228,11 +25224,11 @@ define inreg <8 x bfloat> @bitcast_v16i8_to_v8bf16_scalar(<16 x i8> inreg %a, i3 ; SI-NEXT: s_and_b32 s4, s20, 0xff ; SI-NEXT: s_lshl_b32 s5, s21, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_lshl_b32 s8, s4, 16 +; SI-NEXT: s_lshl_b32 s10, s4, 16 ; SI-NEXT: s_and_b32 s4, s22, 0xff ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s23, 24 -; SI-NEXT: s_or_b32 s10, s5, s4 +; SI-NEXT: s_or_b32 s11, s5, s4 ; SI-NEXT: s_and_b32 s4, s24, 0xff ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s25, 24 @@ -25245,20 +25241,20 @@ define inreg <8 x bfloat> @bitcast_v16i8_to_v8bf16_scalar(<16 x i8> inreg %a, i3 ; SI-NEXT: s_lshl_b32 s5, s29, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_lshl_b32 s14, s4, 16 -; SI-NEXT: s_and_b32 s4, s11, 0xff +; SI-NEXT: s_and_b32 s4, s9, 0xff ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_lshl_b32 s5, s9, 24 +; SI-NEXT: s_lshl_b32 s5, s8, 24 ; SI-NEXT: s_or_b32 s15, s5, s4 ; SI-NEXT: s_cbranch_execnz .LBB111_3 ; SI-NEXT: .LBB111_2: ; %cmp.true ; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: s_and_b32 s4, s28, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s6, s11, 0xff +; SI-NEXT: s_and_b32 s6, s9, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s9, 24 +; SI-NEXT: s_lshl_b32 s5, s8, 24 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 @@ 
-25305,8 +25301,8 @@ define inreg <8 x bfloat> @bitcast_v16i8_to_v8bf16_scalar(<16 x i8> inreg %a, i3 ; SI-NEXT: s_add_i32 s6, s6, 0x3000000 ; SI-NEXT: s_and_b32 s7, s6, 0xffff0000 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_and_b32 s10, s8, 0xffff0000 -; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s11, s8, 0xffff0000 +; SI-NEXT: s_lshl_b32 s10, s8, 16 ; SI-NEXT: s_and_b32 s13, s5, 0xffff0000 ; SI-NEXT: s_lshl_b32 s12, s5, 16 ; SI-NEXT: s_and_b32 s15, s4, 0xffff0000 @@ -25314,8 +25310,8 @@ define inreg <8 x bfloat> @bitcast_v16i8_to_v8bf16_scalar(<16 x i8> inreg %a, i3 ; SI-NEXT: .LBB111_3: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: v_mov_b32_e32 v3, s10 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_mov_b32_e32 v3, s11 ; SI-NEXT: v_mov_b32_e32 v4, s12 ; SI-NEXT: v_mov_b32_e32 v5, s13 ; SI-NEXT: v_mov_b32_e32 v6, s14 @@ -25324,8 +25320,8 @@ define inreg <8 x bfloat> @bitcast_v16i8_to_v8bf16_scalar(<16 x i8> inreg %a, i3 ; SI-NEXT: .LBB111_4: ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr11 ; SI-NEXT: ; implicit-def: $sgpr12 ; SI-NEXT: ; implicit-def: $sgpr13 ; SI-NEXT: ; implicit-def: $sgpr14 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll index f8ffaa456c2b3..bde4fdb30e0f9 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll @@ -5622,7 +5622,7 @@ define inreg <32 x i8> @bitcast_v8i32_to_v32i8_scalar(<8 x i32> inreg %a, i32 in ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s20, 0 -; GFX11-NEXT: s_mov_b32 s46, 0 +; GFX11-NEXT: s_mov_b32 s44, 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB25_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_lshr_b32 s12, s19, 24 @@ -5643,13 +5643,13 @@ 
define inreg <32 x i8> @bitcast_v8i32_to_v32i8_scalar(<8 x i32> inreg %a, i32 in ; GFX11-NEXT: s_lshr_b32 s41, s1, 24 ; GFX11-NEXT: s_lshr_b32 s42, s1, 16 ; GFX11-NEXT: s_lshr_b32 s43, s1, 8 -; GFX11-NEXT: s_lshr_b32 s44, s0, 16 -; GFX11-NEXT: s_lshr_b32 s45, s0, 8 +; GFX11-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-NEXT: s_lshr_b32 s46, s0, 8 ; GFX11-NEXT: s_lshr_b64 s[4:5], s[18:19], 24 ; GFX11-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 ; GFX11-NEXT: s_lshr_b64 s[8:9], s[2:3], 24 ; GFX11-NEXT: s_lshr_b64 s[10:11], s[0:1], 24 -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s44 ; GFX11-NEXT: s_cbranch_vccnz .LBB25_3 ; GFX11-NEXT: .LBB25_2: ; %cmp.true ; GFX11-NEXT: s_add_i32 s1, s1, 3 @@ -5682,12 +5682,12 @@ define inreg <32 x i8> @bitcast_v8i32_to_v32i8_scalar(<8 x i32> inreg %a, i32 in ; GFX11-NEXT: s_lshr_b32 s41, s1, 24 ; GFX11-NEXT: s_lshr_b32 s42, s1, 16 ; GFX11-NEXT: s_lshr_b32 s43, s1, 8 -; GFX11-NEXT: s_lshr_b32 s44, s0, 16 -; GFX11-NEXT: s_lshr_b32 s45, s0, 8 +; GFX11-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-NEXT: s_lshr_b32 s46, s0, 8 ; GFX11-NEXT: .LBB25_3: ; %end ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s45 -; GFX11-NEXT: v_dual_mov_b32 v2, s44 :: v_dual_mov_b32 v3, s10 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s46 +; GFX11-NEXT: v_dual_mov_b32 v2, s45 :: v_dual_mov_b32 v3, s10 ; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s43 ; GFX11-NEXT: v_dual_mov_b32 v6, s42 :: v_dual_mov_b32 v7, s41 ; GFX11-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v9, s40 @@ -5704,8 +5704,8 @@ define inreg <32 x i8> @bitcast_v8i32_to_v32i8_scalar(<8 x i32> inreg %a, i32 in ; GFX11-NEXT: v_dual_mov_b32 v30, s13 :: v_dual_mov_b32 v31, s12 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB25_4: +; GFX11-NEXT: ; implicit-def: $sgpr46 ; GFX11-NEXT: ; implicit-def: $sgpr45 -; GFX11-NEXT: ; implicit-def: $sgpr44 ; GFX11-NEXT: ; implicit-def: $sgpr10 
; GFX11-NEXT: ; implicit-def: $sgpr43 ; GFX11-NEXT: ; implicit-def: $sgpr42 @@ -19218,7 +19218,7 @@ define inreg <32 x i8> @bitcast_v4i64_to_v32i8_scalar(<4 x i64> inreg %a, i32 in ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s20, 0 -; GFX11-NEXT: s_mov_b32 s46, 0 +; GFX11-NEXT: s_mov_b32 s44, 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB69_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_lshr_b32 s12, s19, 24 @@ -19239,13 +19239,13 @@ define inreg <32 x i8> @bitcast_v4i64_to_v32i8_scalar(<4 x i64> inreg %a, i32 in ; GFX11-NEXT: s_lshr_b32 s41, s1, 24 ; GFX11-NEXT: s_lshr_b32 s42, s1, 16 ; GFX11-NEXT: s_lshr_b32 s43, s1, 8 -; GFX11-NEXT: s_lshr_b32 s44, s0, 16 -; GFX11-NEXT: s_lshr_b32 s45, s0, 8 +; GFX11-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-NEXT: s_lshr_b32 s46, s0, 8 ; GFX11-NEXT: s_lshr_b64 s[4:5], s[18:19], 24 ; GFX11-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 ; GFX11-NEXT: s_lshr_b64 s[8:9], s[2:3], 24 ; GFX11-NEXT: s_lshr_b64 s[10:11], s[0:1], 24 -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s44 ; GFX11-NEXT: s_cbranch_vccnz .LBB69_3 ; GFX11-NEXT: .LBB69_2: ; %cmp.true ; GFX11-NEXT: s_add_u32 s0, s0, 3 @@ -19278,12 +19278,12 @@ define inreg <32 x i8> @bitcast_v4i64_to_v32i8_scalar(<4 x i64> inreg %a, i32 in ; GFX11-NEXT: s_lshr_b32 s41, s1, 24 ; GFX11-NEXT: s_lshr_b32 s42, s1, 16 ; GFX11-NEXT: s_lshr_b32 s43, s1, 8 -; GFX11-NEXT: s_lshr_b32 s44, s0, 16 -; GFX11-NEXT: s_lshr_b32 s45, s0, 8 +; GFX11-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-NEXT: s_lshr_b32 s46, s0, 8 ; GFX11-NEXT: .LBB69_3: ; %end ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s45 -; GFX11-NEXT: v_dual_mov_b32 v2, s44 :: v_dual_mov_b32 v3, s10 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s46 +; GFX11-NEXT: v_dual_mov_b32 v2, s45 :: v_dual_mov_b32 v3, s10 ; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s43 ; GFX11-NEXT: 
v_dual_mov_b32 v6, s42 :: v_dual_mov_b32 v7, s41 ; GFX11-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v9, s40 @@ -19300,8 +19300,8 @@ define inreg <32 x i8> @bitcast_v4i64_to_v32i8_scalar(<4 x i64> inreg %a, i32 in ; GFX11-NEXT: v_dual_mov_b32 v30, s13 :: v_dual_mov_b32 v31, s12 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB69_4: +; GFX11-NEXT: ; implicit-def: $sgpr46 ; GFX11-NEXT: ; implicit-def: $sgpr45 -; GFX11-NEXT: ; implicit-def: $sgpr44 ; GFX11-NEXT: ; implicit-def: $sgpr10 ; GFX11-NEXT: ; implicit-def: $sgpr43 ; GFX11-NEXT: ; implicit-def: $sgpr42 @@ -30051,36 +30051,32 @@ define <32 x i8> @bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) { ; VI-LABEL: bitcast_v16i16_to_v32i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v33, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v35, v3 +; VI-NEXT: v_mov_b32_e32 v34, v2 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 ; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v0 -; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr5 ; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr8 ; VI-NEXT: ; implicit-def: $vgpr9 -; VI-NEXT: ; implicit-def: $vgpr35 ; VI-NEXT: ; implicit-def: $vgpr13 ; VI-NEXT: ; implicit-def: $vgpr15 -; VI-NEXT: ; implicit-def: $vgpr16 ; VI-NEXT: ; implicit-def: $vgpr17 -; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: ; 
implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr23 -; VI-NEXT: ; implicit-def: $vgpr24 ; VI-NEXT: ; implicit-def: $vgpr25 -; VI-NEXT: ; implicit-def: $vgpr51 ; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: ; implicit-def: $vgpr19 ; VI-NEXT: ; implicit-def: $vgpr27 @@ -30091,97 +30087,92 @@ define <32 x i8> @bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 ; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 ; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v5 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v5 -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v3 -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v3 -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v1 -; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v1 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 +; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[4:5] -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[2:3] -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[0:1] -; VI-NEXT: v_mov_b32_e32 v50, v0 -; VI-NEXT: v_mov_b32_e32 v48, v1 -; VI-NEXT: v_mov_b32_e32 v8, v2 -; VI-NEXT: v_mov_b32_e32 v35, v3 -; VI-NEXT: v_mov_b32_e32 v16, v4 -; VI-NEXT: v_mov_b32_e32 v49, v5 -; VI-NEXT: v_mov_b32_e32 v24, v6 -; VI-NEXT: v_mov_b32_e32 v51, v7 -; VI-NEXT: ; implicit-def: $vgpr1 -; VI-NEXT: ; implicit-def: $vgpr3 -; VI-NEXT: ; implicit-def: $vgpr5 -; VI-NEXT: ; implicit-def: $vgpr7 +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; VI-NEXT: 
v_lshrrev_b64 v[11:12], 24, v[34:35] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 ; VI-NEXT: .LBB96_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB96_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v9, 3 -; VI-NEXT: v_add_u16_sdwa v36, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v32, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v14, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v10, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v22, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v18, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v30, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v26, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v48, 3, v1 +; VI-NEXT: v_mov_b32_e32 v13, 3 +; VI-NEXT: v_add_u16_sdwa v2, v0, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v15, 3, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; VI-NEXT: v_add_u16_sdwa v14, v35, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v15, v0 +; VI-NEXT: v_add_u16_e32 v16, 3, v35 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v14 +; VI-NEXT: v_add_u16_sdwa v10, v34, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v12, v16, v0 +; VI-NEXT: v_add_u16_e32 v17, 3, v34 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v10 +; VI-NEXT: v_add_u16_sdwa v22, v33, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v11, v17, v0 
+; VI-NEXT: v_add_u16_e32 v19, 3, v33 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 +; VI-NEXT: v_add_u16_sdwa v18, v32, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v36, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v19, v0 +; VI-NEXT: v_add_u16_e32 v20, 3, v32 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; VI-NEXT: v_add_u16_sdwa v30, v7, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v5, 3, v1 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 -; VI-NEXT: v_add_u16_e32 v50, 3, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 -; VI-NEXT: v_add_u16_e32 v35, 3, v3 -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v14 -; VI-NEXT: v_add_u16_e32 v8, 3, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 -; VI-NEXT: v_add_u16_e32 v49, 3, v5 -; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v22 -; VI-NEXT: v_add_u16_e32 v16, 3, v4 -; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 -; VI-NEXT: v_add_u16_e32 v51, 3, v7 -; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v30 -; VI-NEXT: v_add_u16_e32 v24, 3, v6 -; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v26 -; VI-NEXT: v_or_b32_e32 v1, v48, v1 -; VI-NEXT: v_or_b32_e32 v0, v50, v0 -; VI-NEXT: v_or_b32_e32 v3, v35, v3 -; VI-NEXT: v_or_b32_e32 v2, v8, v2 -; VI-NEXT: v_or_b32_e32 v5, v49, v5 -; VI-NEXT: v_or_b32_e32 v4, v16, v4 -; VI-NEXT: v_or_b32_e32 v7, v51, v7 -; VI-NEXT: v_or_b32_e32 v6, v24, v6 -; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[4:5] -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[2:3] -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[0:1] -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v5 -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v3 -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v1 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; VI-NEXT: v_or_b32_e32 v8, v20, v0 +; 
VI-NEXT: v_add_u16_e32 v23, 3, v7 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 +; VI-NEXT: v_add_u16_sdwa v26, v6, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v5, v1 +; VI-NEXT: v_or_b32_e32 v1, v23, v0 +; VI-NEXT: v_add_u16_e32 v7, 3, v6 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 +; VI-NEXT: v_or_b32_e32 v0, v7, v0 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[0:1] +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v9 +; VI-NEXT: v_mov_b32_e32 v1, v5 +; VI-NEXT: v_mov_b32_e32 v32, v20 +; VI-NEXT: v_mov_b32_e32 v33, v19 +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[8:9] +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v11 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[11:12] +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v3 +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[3:4] +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v0 +; VI-NEXT: v_mov_b32_e32 v0, v15 +; VI-NEXT: v_mov_b32_e32 v34, v17 +; VI-NEXT: v_mov_b32_e32 v35, v16 +; VI-NEXT: v_mov_b32_e32 v6, v7 +; VI-NEXT: v_mov_b32_e32 v7, v23 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v8 ; VI-NEXT: v_bfe_u32 v31, v30, 8, 8 ; VI-NEXT: v_bfe_u32 v23, v22, 8, 8 ; VI-NEXT: v_bfe_u32 v15, v14, 8, 8 -; VI-NEXT: v_bfe_u32 v39, v36, 8, 8 +; VI-NEXT: v_bfe_u32 v37, v36, 8, 8 ; VI-NEXT: .LBB96_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_mov_b32_e32 v0, v50 +; VI-NEXT: v_mov_b32_e32 v4, v1 +; VI-NEXT: v_mov_b32_e32 v8, v34 +; VI-NEXT: v_mov_b32_e32 v12, v35 +; VI-NEXT: v_mov_b32_e32 v16, v32 +; VI-NEXT: v_mov_b32_e32 v20, v33 +; VI-NEXT: v_mov_b32_e32 v24, v6 +; VI-NEXT: v_mov_b32_e32 v28, v7 ; VI-NEXT: v_mov_b32_e32 v1, v38 -; VI-NEXT: v_mov_b32_e32 v2, v32 -; VI-NEXT: v_mov_b32_e32 v3, v33 -; VI-NEXT: v_mov_b32_e32 v4, v48 -; VI-NEXT: v_mov_b32_e32 v5, v37 ; VI-NEXT: v_mov_b32_e32 v6, v36 -; VI-NEXT: v_mov_b32_e32 v7, v39 -; VI-NEXT: v_mov_b32_e32 v12, v35 -; VI-NEXT: v_mov_b32_e32 v20, v49 -; 
VI-NEXT: v_mov_b32_e32 v28, v51 +; VI-NEXT: v_mov_b32_e32 v7, v37 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v16i16_to_v32i8: @@ -38417,19 +38408,19 @@ define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v3 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v3 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[2:3] -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v27.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v19.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v26.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v11.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v3.h +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[2:3] ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v27.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v26.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l ; GFX11-TRUE16-NEXT: .LBB108_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB108_4 @@ -38591,6 +38582,7 @@ define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2 ; GFX11-TRUE16-NEXT: .LBB108_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v35.l @@ -40838,10 +40830,10 @@ define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a, ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; SI-NEXT: v_readfirstlane_b32 s42, v15 ; SI-NEXT: v_readfirstlane_b32 s43, v14 -; SI-NEXT: v_readfirstlane_b32 s40, v7 -; SI-NEXT: v_readfirstlane_b32 s41, v6 -; SI-NEXT: v_readfirstlane_b32 s10, v1 -; SI-NEXT: v_readfirstlane_b32 s9, v0 +; SI-NEXT: v_readfirstlane_b32 s13, v7 +; SI-NEXT: v_readfirstlane_b32 s15, v6 +; SI-NEXT: v_readfirstlane_b32 s7, v1 +; SI-NEXT: v_readfirstlane_b32 s6, v0 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v5 ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v9 @@ -40852,15 +40844,15 @@ define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a, ; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s17, 24 -; SI-NEXT: s_or_b32 s6, s5, s4 +; SI-NEXT: s_or_b32 s8, s5, s4 ; SI-NEXT: s_and_b32 s4, s18, 0xff ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s19, 24 -; SI-NEXT: s_or_b32 s7, s5, s4 +; SI-NEXT: s_or_b32 s9, s5, s4 ; SI-NEXT: s_and_b32 s4, s20, 0xff ; SI-NEXT: s_lshl_b32 s5, s21, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_lshl_b32 s8, s4, 16 +; SI-NEXT: s_lshl_b32 s10, s4, 16 ; SI-NEXT: s_and_b32 s4, s22, 0xff ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s23, 24 @@ -40878,24 +40870,24 @@ define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a, ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s27, 24 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: s_or_b32 s13, s5, s4 +; SI-NEXT: s_or_b32 s14, s5, s4 ; SI-NEXT: s_and_b32 s4, s28, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 ; SI-NEXT: v_or_b32_e32 v9, v0, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 
-; SI-NEXT: s_lshl_b32 s14, s4, 16 -; SI-NEXT: s_and_b32 s4, s9, 0xff +; SI-NEXT: s_lshl_b32 s40, s4, 16 +; SI-NEXT: s_and_b32 s4, s6, 0xff ; SI-NEXT: v_or_b32_e32 v19, v1, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v10 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_lshl_b32 s5, s10, 24 +; SI-NEXT: s_lshl_b32 s5, s7, 24 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v11 -; SI-NEXT: s_or_b32 s15, s5, s4 -; SI-NEXT: s_and_b32 s4, s41, 0xff -; SI-NEXT: s_lshl_b32 s5, s40, 8 +; SI-NEXT: s_or_b32 s41, s5, s4 +; SI-NEXT: s_and_b32 s4, s15, 0xff +; SI-NEXT: s_lshl_b32 s5, s13, 8 ; SI-NEXT: v_or_b32_e32 v18, v13, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v12 ; SI-NEXT: s_or_b32 s4, s4, s5 @@ -40931,11 +40923,11 @@ define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a, ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v5, v5, v9 -; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 ; SI-NEXT: v_or_b32_e32 v6, s4, v6 ; SI-NEXT: v_or_b32_e32 v5, v5, v7 -; SI-NEXT: s_and_b32 s4, s41, 0xff -; SI-NEXT: s_lshl_b32 s5, s40, 8 +; SI-NEXT: s_and_b32 s4, s15, 0xff +; SI-NEXT: s_lshl_b32 s5, s13, 8 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v8 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 @@ -40947,11 +40939,11 @@ define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, s4, v1 ; SI-NEXT: s_and_b32 s4, s28, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s6, s9, 0xff +; SI-NEXT: s_and_b32 s6, s6, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s10, 24 +; SI-NEXT: s_lshl_b32 s5, s7, 24 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 @@ -40974,36 +40966,35 @@ define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a, ; 
SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: s_and_b32 s8, s22, 0xff +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: s_addk_i32 s6, 0x300 ; SI-NEXT: s_lshl_b32 s7, s23, 24 ; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v4 -; SI-NEXT: s_add_i32 s8, s6, 0x3000000 -; SI-NEXT: s_and_b32 s6, s16, 0xff -; SI-NEXT: s_lshl_b32 s7, s17, 8 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s16, 0xff +; SI-NEXT: s_lshl_b32 s8, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: s_and_b32 s9, s18, 0xff ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x300, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_addk_i32 s6, 0x300 -; SI-NEXT: s_lshl_b32 s7, s19, 24 +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: s_lshl_b32 s8, s19, 24 ; SI-NEXT: s_lshl_b32 s9, s9, 16 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v3 -; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v6 ; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v5 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v1 @@ -41011,14 +41002,15 @@ define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a, ; SI-NEXT: s_add_i32 s4, s4, 0x3000000 ; SI-NEXT: s_add_i32 s5, s5, 0x3000000 ; SI-NEXT: s_add_i32 s6, s6, 0x3000000 -; SI-NEXT: s_and_b32 s7, s6, 0xffff0000 -; SI-NEXT: 
s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_and_b32 s11, s8, 0xffff0000 -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_and_b32 s13, s5, 0xffff0000 +; SI-NEXT: s_add_i32 s7, s7, 0x3000000 +; SI-NEXT: s_and_b32 s9, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s8, s7, 16 +; SI-NEXT: s_and_b32 s11, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s10, s6, 16 +; SI-NEXT: s_and_b32 s14, s5, 0xffff0000 ; SI-NEXT: s_lshl_b32 s12, s5, 16 -; SI-NEXT: s_and_b32 s15, s4, 0xffff0000 -; SI-NEXT: s_lshl_b32 s14, s4, 16 +; SI-NEXT: s_and_b32 s41, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s40, s4, 16 ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v0 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v0 ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v1 @@ -41029,14 +41021,14 @@ define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v6 ; SI-NEXT: s_branch .LBB111_5 ; SI-NEXT: .LBB111_3: -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr10 ; SI-NEXT: ; implicit-def: $sgpr11 ; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr13 ; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr41 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $sgpr44 @@ -41050,14 +41042,14 @@ define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a, ; SI-NEXT: v_mov_b32_e32 v10, s44 ; SI-NEXT: v_mov_b32_e32 v14, s45 ; SI-NEXT: .LBB111_5: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: v_mov_b32_e32 v3, s11 ; SI-NEXT: v_mov_b32_e32 v4, s12 -; SI-NEXT: v_mov_b32_e32 v5, s13 -; SI-NEXT: v_mov_b32_e32 v6, s14 -; SI-NEXT: v_mov_b32_e32 v7, s15 +; 
SI-NEXT: v_mov_b32_e32 v5, s14 +; SI-NEXT: v_mov_b32_e32 v6, s40 +; SI-NEXT: v_mov_b32_e32 v7, s41 ; SI-NEXT: v_mov_b32_e32 v8, v17 ; SI-NEXT: v_mov_b32_e32 v11, v19 ; SI-NEXT: v_mov_b32_e32 v12, v18 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll index 0cefbc1c2dee5..5d0a2ca68ed57 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll @@ -4081,7 +4081,7 @@ define inreg <40 x i8> @bitcast_v10i32_to_v40i8_scalar(<10 x i32> inreg %a, i32 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s22, 0 -; GFX11-NEXT: s_mov_b32 s63, 0 +; GFX11-NEXT: s_mov_b32 s61, 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB13_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_lshr_b32 s14, s21, 24 @@ -4107,14 +4107,14 @@ define inreg <40 x i8> @bitcast_v10i32_to_v40i8_scalar(<10 x i32> inreg %a, i32 ; GFX11-NEXT: s_lshr_b32 s58, s1, 24 ; GFX11-NEXT: s_lshr_b32 s59, s1, 16 ; GFX11-NEXT: s_lshr_b32 s60, s1, 8 -; GFX11-NEXT: s_lshr_b32 s61, s0, 16 -; GFX11-NEXT: s_lshr_b32 s62, s0, 8 +; GFX11-NEXT: s_lshr_b32 s62, s0, 16 +; GFX11-NEXT: s_lshr_b32 s63, s0, 8 ; GFX11-NEXT: s_lshr_b64 s[4:5], s[20:21], 24 ; GFX11-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 ; GFX11-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 ; GFX11-NEXT: s_lshr_b64 s[10:11], s[2:3], 24 ; GFX11-NEXT: s_lshr_b64 s[12:13], s[0:1], 24 -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s63 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s61 ; GFX11-NEXT: s_cbranch_vccnz .LBB13_3 ; GFX11-NEXT: .LBB13_2: ; %cmp.true ; GFX11-NEXT: s_add_i32 s1, s1, 3 @@ -4155,12 +4155,12 @@ define inreg <40 x i8> @bitcast_v10i32_to_v40i8_scalar(<10 x i32> inreg %a, i32 ; GFX11-NEXT: s_lshr_b32 s58, s1, 24 ; GFX11-NEXT: s_lshr_b32 s59, s1, 16 ; GFX11-NEXT: s_lshr_b32 s60, s1, 8 -; GFX11-NEXT: s_lshr_b32 s61, s0, 16 -; GFX11-NEXT: s_lshr_b32 s62, s0, 8 +; GFX11-NEXT: s_lshr_b32 s62, s0, 16 +; GFX11-NEXT: 
s_lshr_b32 s63, s0, 8 ; GFX11-NEXT: .LBB13_3: ; %end ; GFX11-NEXT: s_and_b32 s0, s0, 0xff -; GFX11-NEXT: s_lshl_b32 s5, s62, 8 -; GFX11-NEXT: s_and_b32 s7, s61, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s63, 8 +; GFX11-NEXT: s_and_b32 s7, s62, 0xff ; GFX11-NEXT: s_lshl_b32 s9, s12, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s5 ; GFX11-NEXT: s_or_b32 s5, s7, s9 @@ -4260,8 +4260,8 @@ define inreg <40 x i8> @bitcast_v10i32_to_v40i8_scalar(<10 x i32> inreg %a, i32 ; GFX11-NEXT: scratch_store_b64 v0, v[9:10], off offset:32 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB13_4: +; GFX11-NEXT: ; implicit-def: $sgpr63 ; GFX11-NEXT: ; implicit-def: $sgpr62 -; GFX11-NEXT: ; implicit-def: $sgpr61 ; GFX11-NEXT: ; implicit-def: $sgpr12 ; GFX11-NEXT: ; implicit-def: $sgpr60 ; GFX11-NEXT: ; implicit-def: $sgpr59 @@ -15588,10 +15588,9 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 ; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v4 ; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v6 @@ -15604,47 +15603,48 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v18 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v20 -; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 
; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB48_2 @@ -15669,10 +15669,10 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v6, v1, v25 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 ; SI-NEXT: v_or_b32_e32 v2, v1, v24 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 
4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v4, 8, 8 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 ; SI-NEXT: v_alignbit_b32 v39, v31, v30, 24 ; SI-NEXT: v_alignbit_b32 v48, v31, v30, 16 ; SI-NEXT: v_alignbit_b32 v52, v31, v30, 8 @@ -15688,20 +15688,20 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v26, v2, v6, 24 ; SI-NEXT: v_alignbit_b32 v27, v2, v6, 16 ; SI-NEXT: v_alignbit_b32 v32, v2, v6, 8 -; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v31 -; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v22 -; SI-NEXT: v_lshrrev_b32_e32 v41, 8, v18 -; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v45, 8, v31 +; SI-NEXT: v_lshrrev_b32_e32 v42, 8, v22 +; SI-NEXT: v_lshrrev_b32_e32 v55, 8, v18 ; SI-NEXT: v_lshrrev_b32_e32 v50, 8, v2 -; SI-NEXT: v_and_b32_e32 v45, 0xffff, v8 -; SI-NEXT: v_and_b32_e32 v42, 0xffff, v12 -; SI-NEXT: v_and_b32_e32 v55, 0xffff, v16 -; SI-NEXT: v_and_b32_e32 v51, 0xffff, v20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: v_bfe_u32 v46, v8, 8, 8 -; SI-NEXT: v_bfe_u32 v43, v12, 8, 8 -; SI-NEXT: v_bfe_u32 v40, v16, 8, 8 -; SI-NEXT: v_bfe_u32 v53, v20, 8, 8 +; SI-NEXT: v_and_b32_e32 v46, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v43, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v40, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v53, 0xffff, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v47, v4, 8, 8 +; SI-NEXT: v_bfe_u32 v44, v8, 8, 8 +; SI-NEXT: v_bfe_u32 v41, v12, 8, 8 +; SI-NEXT: v_bfe_u32 v54, v16, 8, 8 +; SI-NEXT: v_bfe_u32 v51, v20, 8, 8 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr5 @@ -15723,10 +15723,7 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: .LBB48_2: ; %Flow -; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: 
buffer_load_dword v8, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_xor_b64 exec, exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB48_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v13 @@ -15771,6 +15768,7 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v4 ; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v10 ; SI-NEXT: v_alignbit_b32 v39, v31, v30, 24 ; SI-NEXT: v_alignbit_b32 v48, v31, v30, 16 ; SI-NEXT: v_alignbit_b32 v52, v31, v30, 8 @@ -15786,22 +15784,23 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v26, v2, v6, 24 ; SI-NEXT: v_alignbit_b32 v27, v2, v6, 16 ; SI-NEXT: v_alignbit_b32 v32, v2, v6, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v12, 24, v31 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v31 -; SI-NEXT: v_lshrrev_b32_e32 v46, 24, v22 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v22 -; SI-NEXT: v_lshrrev_b32_e32 v43, 24, v18 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v41, 8, v18 -; SI-NEXT: v_lshrrev_b32_e32 v40, 24, v10 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v10 -; SI-NEXT: v_lshrrev_b32_e32 v53, 24, v2 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v31 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v45, 8, v31 +; SI-NEXT: v_lshrrev_b32_e32 v44, 24, v22 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v42, 8, v22 +; SI-NEXT: v_lshrrev_b32_e32 v41, 24, v18 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v55, 8, v18 +; SI-NEXT: 
v_lshrrev_b32_e32 v54, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v51, 24, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v50, 8, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: .LBB48_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) @@ -15817,13 +15816,11 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v31 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v47 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v45 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v46 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v12 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v47 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 @@ -15843,11 +15840,11 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v44 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v42 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v45 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v43 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v46 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v44 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 @@ -15867,11 +15864,11 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen 
; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v41 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v55 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v42 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v40 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v43 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v41 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 @@ -15889,13 +15886,15 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v54 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v54 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v55 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v53 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v40 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 @@ -15917,13 +15916,15 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v50 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v51 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v53 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v51 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: 
v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -15958,40 +15959,27 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 ; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr16 ; VI-NEXT: ; implicit-def: $vgpr15 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr41 ; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr14 -; VI-NEXT: ; implicit-def: $vgpr37 ; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr52 -; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr14 ; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr13 -; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr55 ; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; implicit-def: $vgpr12 -; VI-NEXT: ; implicit-def: $vgpr53 ; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: 
$vgpr33 -; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr28 -; VI-NEXT: ; implicit-def: $vgpr11 -; VI-NEXT: ; implicit-def: $vgpr42 ; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB48_2 @@ -16000,37 +15988,22 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] ; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] ; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] ; VI-NEXT: v_lshrrev_b32_e32 v29, 24, v10 ; VI-NEXT: v_lshrrev_b32_e32 v27, 8, v10 ; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v9 -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v8 ; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v8 ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v6 -; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v5 -; VI-NEXT: v_lshrrev_b32_e32 v52, 24, v4 -; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v3 -; VI-NEXT: v_lshrrev_b32_e32 v41, 24, v2 -; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v40, 8, v1 -; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] -; VI-NEXT: v_mov_b32_e32 v34, v1 -; VI-NEXT: v_mov_b32_e32 v32, v2 -; VI-NEXT: v_mov_b32_e32 v38, v3 -; VI-NEXT: v_mov_b32_e32 v37, v4 -; VI-NEXT: v_mov_b32_e32 v50, v5 -; VI-NEXT: v_mov_b32_e32 v49, v6 -; VI-NEXT: v_mov_b32_e32 v55, v7 -; VI-NEXT: v_mov_b32_e32 v53, v8 -; VI-NEXT: v_mov_b32_e32 v43, v9 -; VI-NEXT: v_mov_b32_e32 v42, v10 -; VI-NEXT: ; implicit-def: $vgpr1 -; VI-NEXT: ; implicit-def: $vgpr3 -; VI-NEXT: ; implicit-def: $vgpr5 -; VI-NEXT: ; implicit-def: $vgpr7 -; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: v_lshrrev_b32_e32 v35, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v6 +; 
VI-NEXT: v_lshrrev_b32_e32 v34, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v48, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1 ; VI-NEXT: .LBB48_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB48_4 @@ -16040,137 +16013,144 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v20, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_sdwa v18, v8, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_sdwa v22, v7, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v42, 3, v10 +; VI-NEXT: v_add_u16_e32 v55, 3, v10 ; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v17 -; VI-NEXT: v_add_u16_e32 v43, 3, v9 +; VI-NEXT: v_add_u16_e32 v40, 3, v9 ; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v20 ; VI-NEXT: v_add_u16_sdwa v19, v6, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_sdwa v24, v5, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v53, 3, v8 ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v18 -; VI-NEXT: v_add_u16_e32 v55, 3, v7 +; VI-NEXT: v_add_u16_e32 v54, 3, v7 ; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v22 -; VI-NEXT: v_or_b32_e32 v10, v42, v10 -; VI-NEXT: v_or_b32_e32 v9, v43, v9 +; VI-NEXT: v_or_b32_e32 v10, v55, v10 +; VI-NEXT: v_or_b32_e32 v9, v40, v9 ; VI-NEXT: v_add_u16_sdwa v23, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_sdwa v26, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_sdwa v21, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_sdwa v25, v3, v11 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v49, 3, v6 +; VI-NEXT: v_add_u16_e32 v51, 3, v6 ; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v19 -; VI-NEXT: v_add_u16_e32 v50, 3, v5 +; VI-NEXT: v_add_u16_e32 v52, 3, v5 ; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v24 ; VI-NEXT: v_or_b32_e32 v8, v53, v8 -; VI-NEXT: v_or_b32_e32 v7, v55, v7 +; VI-NEXT: v_or_b32_e32 v7, v54, v7 ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] -; VI-NEXT: v_add_u16_e32 v37, 3, v4 +; VI-NEXT: v_add_u16_e32 v49, 3, v4 ; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v21 -; VI-NEXT: v_add_u16_e32 v38, 3, v3 +; VI-NEXT: v_add_u16_e32 v50, 3, v3 ; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v25 -; VI-NEXT: v_or_b32_e32 v6, v49, v6 -; VI-NEXT: v_or_b32_e32 v5, v50, v5 +; VI-NEXT: v_or_b32_e32 v6, v51, v6 +; VI-NEXT: v_or_b32_e32 v5, v52, v5 ; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] -; VI-NEXT: v_add_u16_e32 v32, 3, v2 +; VI-NEXT: v_add_u16_e32 v39, 3, v2 ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 -; VI-NEXT: v_add_u16_e32 v34, 3, v1 +; VI-NEXT: v_add_u16_e32 v48, 3, v1 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 -; VI-NEXT: v_or_b32_e32 v4, v37, v4 -; VI-NEXT: v_or_b32_e32 v3, v38, v3 +; VI-NEXT: v_or_b32_e32 v4, v49, v4 +; VI-NEXT: v_or_b32_e32 v3, v50, v3 ; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] -; VI-NEXT: v_or_b32_e32 v2, v32, v2 -; VI-NEXT: v_or_b32_e32 v1, v34, v1 +; VI-NEXT: v_or_b32_e32 v2, v39, v2 +; VI-NEXT: v_or_b32_e32 v1, v48, v1 ; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] ; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] ; VI-NEXT: v_lshrrev_b32_e32 v27, 8, v10 ; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v9 ; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v8 ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v5 -; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v3 -; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v40, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v5 +; 
VI-NEXT: v_lshrrev_b32_e32 v36, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1 ; VI-NEXT: v_bfe_u32 v29, v17, 8, 8 -; VI-NEXT: v_bfe_u32 v33, v18, 8, 8 -; VI-NEXT: v_bfe_u32 v39, v19, 8, 8 -; VI-NEXT: v_bfe_u32 v52, v21, 8, 8 -; VI-NEXT: v_bfe_u32 v41, v23, 8, 8 +; VI-NEXT: v_bfe_u32 v32, v18, 8, 8 +; VI-NEXT: v_bfe_u32 v35, v19, 8, 8 +; VI-NEXT: v_mov_b32_e32 v1, v48 +; VI-NEXT: v_mov_b32_e32 v2, v39 +; VI-NEXT: v_mov_b32_e32 v3, v50 +; VI-NEXT: v_mov_b32_e32 v4, v49 +; VI-NEXT: v_mov_b32_e32 v5, v52 +; VI-NEXT: v_mov_b32_e32 v6, v51 +; VI-NEXT: v_mov_b32_e32 v7, v54 +; VI-NEXT: v_mov_b32_e32 v8, v53 +; VI-NEXT: v_mov_b32_e32 v9, v40 +; VI-NEXT: v_mov_b32_e32 v10, v55 +; VI-NEXT: v_bfe_u32 v39, v21, 8, 8 +; VI-NEXT: v_bfe_u32 v48, v23, 8, 8 ; VI-NEXT: .LBB48_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v40 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v15 -; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v26, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v15 +; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v16 +; VI-NEXT: v_or_b32_sdwa v15, v26, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v54 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v41 -; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v38 +; VI-NEXT: 
v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v48 ; VI-NEXT: v_or_b32_sdwa v2, v23, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v51 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v37 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v14 -; VI-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v48 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v52 -; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v39 +; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v21, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v13 -; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa 
v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v24, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v35 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v39 -; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v33 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v35 +; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v31 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v12 -; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v22, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v30 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v33 -; VI-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v32 +; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: 
v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v28 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v11 -; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v29 -; VI-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -19148,8 +19128,8 @@ define inreg <20 x i16> 
@bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 ; SI-NEXT: v_readfirstlane_b32 s14, v19 -; SI-NEXT: v_readfirstlane_b32 s40, v18 -; SI-NEXT: v_readfirstlane_b32 s12, v11 +; SI-NEXT: v_readfirstlane_b32 s15, v18 +; SI-NEXT: v_readfirstlane_b32 s11, v11 ; SI-NEXT: v_readfirstlane_b32 s13, v10 ; SI-NEXT: v_readfirstlane_b32 s8, v3 ; SI-NEXT: v_readfirstlane_b32 s9, v2 @@ -19175,22 +19155,22 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32 ; SI-NEXT: s_lshl_b32 s10, s23, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s10, s5 -; SI-NEXT: s_or_b32 s11, s4, s5 +; SI-NEXT: s_or_b32 s12, s4, s5 ; SI-NEXT: s_and_b32 s4, s18, 0xff ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s10, s19, 24 ; SI-NEXT: s_or_b32 s4, s10, s4 ; SI-NEXT: s_and_b32 s10, s28, 0xff -; SI-NEXT: s_lshl_b32 s15, s29, 8 -; SI-NEXT: s_or_b32 s10, s10, s15 -; SI-NEXT: s_and_b32 s15, s6, 0xff -; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_lshl_b32 s40, s29, 8 +; SI-NEXT: s_or_b32 s10, s10, s40 +; SI-NEXT: s_and_b32 s40, s6, 0xff +; SI-NEXT: s_lshl_b32 s40, s40, 16 ; SI-NEXT: s_lshl_b32 s41, s7, 24 -; SI-NEXT: s_or_b32 s43, s41, s15 -; SI-NEXT: s_and_b32 s15, s26, 0xff -; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_or_b32 s43, s41, s40 +; SI-NEXT: s_and_b32 s40, s26, 0xff +; SI-NEXT: s_lshl_b32 s40, s40, 16 ; SI-NEXT: s_lshl_b32 s41, s27, 24 -; SI-NEXT: s_or_b32 s15, s41, s15 +; SI-NEXT: s_or_b32 s40, s41, s40 ; SI-NEXT: s_and_b32 s41, s16, 0xff ; SI-NEXT: s_lshl_b32 s42, s17, 8 ; SI-NEXT: s_or_b32 s41, s41, s42 @@ -19207,12 +19187,12 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_or_b32_e32 v11, v0, v10 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v5, s40 ; SI-NEXT: v_or_b32_e32 v10, v9, v11 ; SI-NEXT: 
v_and_b32_e32 v9, 0xff, v4 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v14 ; SI-NEXT: v_and_b32_e32 v17, 0xff, v16 -; SI-NEXT: s_or_b32 s15, s4, s15 +; SI-NEXT: s_or_b32 s40, s4, s40 ; SI-NEXT: s_and_b32 s4, s9, 0xff ; SI-NEXT: s_lshl_b32 s42, s8, 8 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 @@ -19229,7 +19209,7 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32 ; SI-NEXT: v_and_b32_e32 v18, 0xff, v24 ; SI-NEXT: v_or_b32_e32 v23, s4, v15 ; SI-NEXT: s_and_b32 s4, s13, 0xff -; SI-NEXT: s_lshl_b32 s42, s12, 8 +; SI-NEXT: s_lshl_b32 s42, s11, 8 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v17, v17, v30 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 @@ -19241,7 +19221,7 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32 ; SI-NEXT: v_or_b32_e32 v18, v17, v32 ; SI-NEXT: v_and_b32_e32 v17, 0xff, v20 ; SI-NEXT: v_or_b32_e32 v26, s4, v21 -; SI-NEXT: s_and_b32 s4, s40, 0xff +; SI-NEXT: s_and_b32 s4, s15, 0xff ; SI-NEXT: s_lshl_b32 s42, s14, 8 ; SI-NEXT: s_and_b32 s10, s10, 0xffff ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 @@ -19249,7 +19229,7 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32 ; SI-NEXT: s_or_b32 s10, s10, s43 ; SI-NEXT: v_or_b32_e32 v33, v31, v17 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_alignbit_b32 v1, s11, v1, 16 +; SI-NEXT: v_alignbit_b32 v1, s12, v1, 16 ; SI-NEXT: v_alignbit_b32 v5, s10, v5, 16 ; SI-NEXT: v_alignbit_b32 v9, v10, v15, 16 ; SI-NEXT: v_alignbit_b32 v13, v25, v21, 16 @@ -19262,8 +19242,8 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v32 ; SI-NEXT: s_cbranch_execnz .LBB51_3 ; SI-NEXT: .LBB51_2: ; %cmp.true -; SI-NEXT: s_add_i32 s40, s40, 3 -; SI-NEXT: s_and_b32 s4, s40, 0xff +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_and_b32 s4, s15, 0xff ; SI-NEXT: s_lshl_b32 s5, s14, 8 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 ; SI-NEXT: s_or_b32 s4, s5, s4 @@ -19287,7 
+19267,7 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32 ; SI-NEXT: s_add_i32 s13, s13, 3 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x3000000, v1 ; SI-NEXT: s_and_b32 s4, s13, 0xff -; SI-NEXT: s_lshl_b32 s5, s12, 8 +; SI-NEXT: s_lshl_b32 s5, s11, 8 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 @@ -19332,7 +19312,7 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s15, s4, 0x3000000 +; SI-NEXT: s_add_i32 s40, s4, 0x3000000 ; SI-NEXT: s_and_b32 s4, s28, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 ; SI-NEXT: s_add_i32 s6, s6, 3 @@ -19382,24 +19362,24 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 -; SI-NEXT: s_add_i32 s11, s4, 0x3000000 +; SI-NEXT: s_add_i32 s12, s4, 0x3000000 ; SI-NEXT: v_mov_b32_e32 v0, s41 -; SI-NEXT: v_alignbit_b32 v1, s11, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s15 +; SI-NEXT: v_alignbit_b32 v1, s12, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s40 ; SI-NEXT: v_alignbit_b32 v5, s10, v0, 16 ; SI-NEXT: v_alignbit_b32 v9, v10, v23, 16 ; SI-NEXT: v_alignbit_b32 v13, v25, v26, 16 ; SI-NEXT: v_alignbit_b32 v17, v18, v21, 16 -; SI-NEXT: s_lshr_b32 s42, s11, 16 +; SI-NEXT: s_lshr_b32 s42, s12, 16 ; SI-NEXT: s_lshr_b32 s43, s10, 16 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v25 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 ; SI-NEXT: .LBB51_3: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s41 -; SI-NEXT: v_mov_b32_e32 v2, s11 +; SI-NEXT: v_mov_b32_e32 v2, s12 ; SI-NEXT: v_mov_b32_e32 v3, s42 -; SI-NEXT: v_mov_b32_e32 v4, s15 +; SI-NEXT: v_mov_b32_e32 v4, s40 ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: v_mov_b32_e32 v7, s43 ; SI-NEXT: v_mov_b32_e32 v8, v23 @@ 
-19410,9 +19390,9 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32 ; SI-NEXT: .LBB51_4: ; SI-NEXT: ; implicit-def: $sgpr41 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr12 ; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $sgpr10 ; SI-NEXT: ; implicit-def: $sgpr43 @@ -36195,7 +36175,7 @@ define inreg <40 x i8> @bitcast_v5i64_to_v40i8_scalar(<5 x i64> inreg %a, i32 in ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s22, 0 -; GFX11-NEXT: s_mov_b32 s63, 0 +; GFX11-NEXT: s_mov_b32 s61, 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB79_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_lshr_b32 s14, s21, 24 @@ -36221,14 +36201,14 @@ define inreg <40 x i8> @bitcast_v5i64_to_v40i8_scalar(<5 x i64> inreg %a, i32 in ; GFX11-NEXT: s_lshr_b32 s58, s1, 24 ; GFX11-NEXT: s_lshr_b32 s59, s1, 16 ; GFX11-NEXT: s_lshr_b32 s60, s1, 8 -; GFX11-NEXT: s_lshr_b32 s61, s0, 16 -; GFX11-NEXT: s_lshr_b32 s62, s0, 8 +; GFX11-NEXT: s_lshr_b32 s62, s0, 16 +; GFX11-NEXT: s_lshr_b32 s63, s0, 8 ; GFX11-NEXT: s_lshr_b64 s[4:5], s[20:21], 24 ; GFX11-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 ; GFX11-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 ; GFX11-NEXT: s_lshr_b64 s[10:11], s[2:3], 24 ; GFX11-NEXT: s_lshr_b64 s[12:13], s[0:1], 24 -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s63 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s61 ; GFX11-NEXT: s_cbranch_vccnz .LBB79_3 ; GFX11-NEXT: .LBB79_2: ; %cmp.true ; GFX11-NEXT: s_add_u32 s0, s0, 3 @@ -36269,12 +36249,12 @@ define inreg <40 x i8> @bitcast_v5i64_to_v40i8_scalar(<5 x i64> inreg %a, i32 in ; GFX11-NEXT: s_lshr_b32 s58, s1, 24 ; GFX11-NEXT: s_lshr_b32 s59, s1, 16 ; GFX11-NEXT: s_lshr_b32 s60, s1, 8 -; GFX11-NEXT: s_lshr_b32 s61, s0, 16 -; GFX11-NEXT: s_lshr_b32 s62, s0, 8 +; GFX11-NEXT: s_lshr_b32 s62, s0, 16 
+; GFX11-NEXT: s_lshr_b32 s63, s0, 8 ; GFX11-NEXT: .LBB79_3: ; %end ; GFX11-NEXT: s_and_b32 s0, s0, 0xff -; GFX11-NEXT: s_lshl_b32 s5, s62, 8 -; GFX11-NEXT: s_and_b32 s7, s61, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s63, 8 +; GFX11-NEXT: s_and_b32 s7, s62, 0xff ; GFX11-NEXT: s_lshl_b32 s9, s12, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s5 ; GFX11-NEXT: s_or_b32 s5, s7, s9 @@ -36374,8 +36354,8 @@ define inreg <40 x i8> @bitcast_v5i64_to_v40i8_scalar(<5 x i64> inreg %a, i32 in ; GFX11-NEXT: scratch_store_b64 v0, v[9:10], off offset:32 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB79_4: +; GFX11-NEXT: ; implicit-def: $sgpr63 ; GFX11-NEXT: ; implicit-def: $sgpr62 -; GFX11-NEXT: ; implicit-def: $sgpr61 ; GFX11-NEXT: ; implicit-def: $sgpr12 ; GFX11-NEXT: ; implicit-def: $sgpr60 ; GFX11-NEXT: ; implicit-def: $sgpr59 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll index 48c9b8775a474..944141b7fe2e3 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll @@ -6055,10 +6055,8 @@ define <4 x i8> @bitcast_v2i16_to_v4i8(<2 x i16> %a, i32 %b) { ; VI-LABEL: bitcast_v2i16_to_v4i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: ; implicit-def: $vgpr1 ; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -6071,20 +6069,19 @@ define <4 x i8> @bitcast_v2i16_to_v4i8(<2 x i16> %a, i32 %b) { ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB56_3: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v4 -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v4 -; VI-NEXT: v_mov_b32_e32 v0, v4 -; VI-NEXT: ; implicit-def: $vgpr4 +; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; VI-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB56_2 ; VI-NEXT: .LBB56_4: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v1, 3 -; VI-NEXT: v_add_u16_sdwa v2, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v0, 3, v4 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; VI-NEXT: v_or_b32_e32 v1, v0, v1 -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_add_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v4, 3, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; VI-NEXT: v_or_b32_e32 v0, v4, v0 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 8, 8 +; VI-NEXT: v_mov_b32_e32 v0, v4 ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -6235,31 +6232,30 @@ define inreg <4 x i8> @bitcast_v2i16_to_v4i8_scalar(<2 x i16> inreg %a, i32 inre ; VI-NEXT: s_cmp_lg_u32 s17, 0 ; VI-NEXT: s_cbranch_scc0 .LBB57_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s8, s16, 24 +; VI-NEXT: s_lshr_b32 s7, s16, 24 ; VI-NEXT: s_lshr_b32 s6, s16, 16 -; VI-NEXT: s_lshr_b32 s9, s16, 8 -; VI-NEXT: s_mov_b32 s7, s16 +; VI-NEXT: s_lshr_b32 s8, s16, 8 ; VI-NEXT: s_cbranch_execnz .LBB57_3 ; VI-NEXT: .LBB57_2: ; %cmp.true -; VI-NEXT: s_lshr_b32 s5, s16, 16 -; VI-NEXT: s_add_i32 s7, s16, 3 -; VI-NEXT: s_add_i32 s6, s5, 3 -; VI-NEXT: s_and_b32 s4, s7, 0xffff -; VI-NEXT: s_lshl_b32 s5, s6, 16 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_lshr_b32 s9, s4, 8 -; VI-NEXT: s_bfe_u32 s8, s6, 0x80008 +; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: s_add_i32 s4, s16, 3 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_and_b32 s5, s4, 0xffff +; VI-NEXT: s_lshl_b32 s7, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_lshr_b32 s8, s5, 8 +; VI-NEXT: s_bfe_u32 s7, s6, 0x80008 ; VI-NEXT: .LBB57_3: ; %end -; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 
v1, s8 ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s8 +; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB57_4: -; VI-NEXT: ; implicit-def: $sgpr7 -; VI-NEXT: ; implicit-def: $sgpr9 -; VI-NEXT: ; implicit-def: $sgpr6 ; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr7 ; VI-NEXT: s_branch .LBB57_2 ; ; GFX9-LABEL: bitcast_v2i16_to_v4i8_scalar: @@ -9059,8 +9055,8 @@ define <4 x i8> @bitcast_v2bf16_to_v4i8(<2 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v4.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 @@ -9089,6 +9085,7 @@ define <4 x i8> @bitcast_v2bf16_to_v4i8(<2 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1 ; GFX11-TRUE16-NEXT: .LBB76_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll index 5aac06a7f3a2b..48719c8846a2c 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll @@ -62984,11 +62984,10 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: 
buffer_store_dword v27, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 -; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 @@ -63063,9 +63062,10 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v4 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v6 ; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v8 ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v10 @@ -63081,8 +63081,8 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v30 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; kill: killed $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr50 @@ -63093,11 +63093,10 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; kill: killed $vgpr2 @@ -63106,8 +63105,8 @@ define <64 x i8> 
@bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; kill: killed $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -63120,207 +63119,204 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v56, v1, v63 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v47, v1, v27 +; SI-NEXT: v_or_b32_e32 v47, v1, v62 ; SI-NEXT: v_alignbit_b32 v1, v47, v56, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v47, v56, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v47, v56, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 ; SI-NEXT: v_or_b32_e32 v50, v1, v37 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v38, v1, v36 ; SI-NEXT: v_alignbit_b32 v1, v38, v50, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v38, v50, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v38, v50, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 ; SI-NEXT: v_or_b32_e32 v34, v1, v48 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 ; SI-NEXT: v_or_b32_e32 v33, v1, v39 ; SI-NEXT: v_alignbit_b32 v1, v33, v34, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v33, v34, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v33, v34, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 ; SI-NEXT: v_or_b32_e32 v32, v1, v51 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 ; SI-NEXT: v_or_b32_e32 v31, v1, v49 ; SI-NEXT: v_alignbit_b32 v1, v31, v32, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v31, v32, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v31, v32, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, 
s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 ; SI-NEXT: v_or_b32_e32 v30, v1, v53 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 ; SI-NEXT: v_or_b32_e32 v26, v1, v52 ; SI-NEXT: v_alignbit_b32 v1, v26, v30, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v26, v30, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v26, v30, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 ; SI-NEXT: v_or_b32_e32 v22, v1, v55 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 ; SI-NEXT: v_or_b32_e32 v18, v1, v54 +; SI-NEXT: v_alignbit_b32 v1, v18, v22, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v18, v22, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 ; SI-NEXT: v_or_b32_e32 v14, v1, v41 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v60, v18, v22, 24 -; SI-NEXT: v_alignbit_b32 v61, v18, v22, 16 -; SI-NEXT: v_bfe_u32 v62, v44, 8, 8 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; 
implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 ; SI-NEXT: v_or_b32_e32 v10, v1, v40 ; SI-NEXT: v_alignbit_b32 v1, v10, v14, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29 ; SI-NEXT: v_or_b32_e32 v6, v1, v43 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v35 ; SI-NEXT: v_or_b32_e32 v2, v1, v42 ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v47 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v38 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v33 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt 
expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v31 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 -; SI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v4, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v8, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v12, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v16, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v20, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v24, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v28, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, 
off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v2 +; SI-NEXT: v_alignbit_b32 v60, v18, v22, 24 ; SI-NEXT: v_alignbit_b32 v57, v10, v14, 24 -; SI-NEXT: v_alignbit_b32 v58, v10, v14, 16 +; SI-NEXT: v_alignbit_b32 v61, v10, v14, 16 ; SI-NEXT: v_alignbit_b32 v45, v2, v6, 24 ; SI-NEXT: v_alignbit_b32 v46, v2, v6, 16 ; SI-NEXT: v_alignbit_b32 v59, v2, v6, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v58, v44, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: 
$vgpr51 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: .LBB96_2: ; %Flow ; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v20, v61 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: s_xor_b64 exec, exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB96_4 ; SI-NEXT: ; %bb.3: ; %cmp.true @@ -63330,35 +63326,7 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: s_mov_b32 s6, 0x30000 ; SI-NEXT: v_or_b32_e32 v4, v41, v4 ; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: 
v_or_b32_e32 v1, v63, v1 -; SI-NEXT: v_add_i32_e32 v56, vcc, s6, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v27, v1 -; SI-NEXT: v_add_i32_e32 v47, vcc, s6, v1 -; SI-NEXT: v_alignbit_b32 v1, v47, v56, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v47, v56, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v29 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v43, v2 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v35 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v42, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; SI-NEXT: v_alignbit_b32 v45, v2, v6, 24 -; SI-NEXT: v_alignbit_b32 v46, v2, v6, 16 -; SI-NEXT: v_alignbit_b32 v59, v2, v6, 8 -; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v2 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v27 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_or_b32_e32 v4, v40, v4 ; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v4 @@ -63392,121 +63360,157 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v11 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_or_b32_e32 v4, v39, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_add_i32_e32 v33, vcc, s6, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v5 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_add_i32_e32 v56, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v3 ; SI-NEXT: v_or_b32_e32 v4, v37, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_add_i32_e32 v50, vcc, s6, v4 ; SI-NEXT: 
v_add_i32_e32 v4, vcc, 3, v7 +; SI-NEXT: v_or_b32_e32 v1, v62, v1 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_add_i32_e32 v47, vcc, s6, v1 ; SI-NEXT: v_or_b32_e32 v4, v36, v4 +; SI-NEXT: v_alignbit_b32 v1, v47, v56, 16 ; SI-NEXT: v_add_i32_e32 v38, vcc, s6, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v47, v56, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v38, v50, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v38, v50, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v38, v50, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v33, v34, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v33, v34, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v33, v34, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt 
expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v31, v32, 8 +; SI-NEXT: v_alignbit_b32 v1, v31, v32, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v31, v32, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v31, v32, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v26, v30, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v26, v30, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v18, v22, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v47 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v47 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v47 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v38 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: 
v_lshrrev_b32_e32 v1, 16, v38 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v38 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v33 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v33 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v31 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v31 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v31 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v26 -; SI-NEXT: buffer_store_dword v1, 
off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v18 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v43, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v18 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v42, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_alignbit_b32 v4, v47, v56, 24 -; SI-NEXT: v_alignbit_b32 v24, v31, v32, 24 -; SI-NEXT: v_alignbit_b32 v28, v31, v32, 16 -; SI-NEXT: v_alignbit_b32 v12, v26, v30, 24 -; SI-NEXT: v_alignbit_b32 v16, v26, v30, 16 -; SI-NEXT: v_alignbit_b32 v44, v26, v30, 8 +; SI-NEXT: v_alignbit_b32 v44, v26, v30, 24 ; SI-NEXT: v_alignbit_b32 v60, v18, v22, 24 ; SI-NEXT: v_alignbit_b32 v61, v18, v22, 16 -; SI-NEXT: v_alignbit_b32 v20, v18, v22, 8 ; SI-NEXT: v_alignbit_b32 v57, v10, v14, 24 -; SI-NEXT: v_alignbit_b32 v58, v10, v14, 16 -; SI-NEXT: v_alignbit_b32 v8, v10, v14, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v20, v10, v14, 16 +; SI-NEXT: v_alignbit_b32 v28, v10, v14, 8 +; SI-NEXT: v_alignbit_b32 v45, v2, v6, 24 +; SI-NEXT: v_alignbit_b32 v46, v2, v6, 16 +; SI-NEXT: v_alignbit_b32 v59, v2, v6, 8 +; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: .LBB96_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: 
v_and_b32_e32 v1, 0xff, v56 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -63514,7 +63518,7 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 @@ -63522,14 +63526,14 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v47 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -63540,14 +63544,14 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, 
v50 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 @@ -63558,14 +63562,14 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -63576,14 +63580,14 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; 
SI-NEXT: v_and_b32_e32 v1, 0xff, v34 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 @@ -63594,14 +63598,14 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -63612,28 +63616,32 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; 
SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v24 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v31 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -63644,26 +63652,30 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: 
s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v30 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v44 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v12 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v26 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 @@ -63674,26 +63686,28 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v20 +; SI-NEXT: 
v_lshlrev_b32_e32 v4, 24, v60 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v61 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v60 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 @@ -63706,9 +63720,9 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v28 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v58 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v20 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v57 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -63716,14 +63730,14 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v3, 
s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v10 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 @@ -63748,13 +63762,13 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v62 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v58 ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 @@ -63783,28 +63797,27 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; ; VI-LABEL: bitcast_v32i16_to_v64i8: ; VI: ; %bb.0: -; VI-NEXT: ; implicit-def: $vgpr19 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 ; VI-NEXT: ; 
implicit-def: $vgpr17 -; VI-NEXT: ; kill: killed $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: ; kill: killed $vgpr17 -; VI-NEXT: ; implicit-def: $vgpr17 -; VI-NEXT: ; kill: killed $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: ; kill: killed $vgpr17 -; VI-NEXT: ; implicit-def: $vgpr17 -; VI-NEXT: ; kill: killed $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: ; kill: killed $vgpr17 -; VI-NEXT: ; implicit-def: $vgpr17 -; VI-NEXT: ; kill: killed $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v1 ; VI-NEXT: ; kill: killed $vgpr17 ; VI-NEXT: ; implicit-def: $vgpr17 -; VI-NEXT: ; kill: killed $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr19 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -63821,415 +63834,312 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; kill: killed $vgpr17 -; 
VI-NEXT: ; implicit-def: $vgpr17 -; VI-NEXT: ; kill: killed $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr42 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v1 -; VI-NEXT: ; kill: killed $vgpr17 -; VI-NEXT: ; implicit-def: $vgpr17 -; VI-NEXT: ; kill: killed $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr42 -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr41 -; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr53 ; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr62 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; implicit-def: $vgpr59 -; VI-NEXT: ; implicit-def: $vgpr57 -; VI-NEXT: ; implicit-def: $vgpr51 ; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr55 -; VI-NEXT: ; implicit-def: $vgpr46 -; VI-NEXT: ; implicit-def: $vgpr61 ; VI-NEXT: ; 
implicit-def: $vgpr60 -; VI-NEXT: ; implicit-def: $vgpr52 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: ; implicit-def: $vgpr53 -; VI-NEXT: ; kill: killed $vgpr17 -; VI-NEXT: ; implicit-def: $vgpr17 -; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr45 ; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 ; VI-NEXT: ; implicit-def: $vgpr56 -; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: ; implicit-def: $vgpr62 -; VI-NEXT: ; kill: killed $vgpr19 +; VI-NEXT: ; kill: killed $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: ; implicit-def: $vgpr25 -; VI-NEXT: ; implicit-def: $vgpr24 -; VI-NEXT: ; implicit-def: $vgpr23 -; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr17 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB96_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v16 -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v15 -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v14 -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v14 -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; VI-NEXT: 
v_lshrrev_b32_e32 v17, 8, v13 -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v12 -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v12 -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v11 -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v10 -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v8 -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v8 -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v7 -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v6 -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[15:16] -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v5 -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[13:14] -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] +; VI-NEXT: v_lshrrev_b64 v[18:19], 
24, v[13:14] ; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] ; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] ; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] -; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] -; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] -; VI-NEXT: v_lshrrev_b32_e32 v50, 24, v16 -; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] -; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v9 -; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v6 -; VI-NEXT: v_lshrrev_b32_e32 v46, 24, v4 -; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v3 -; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v2 -; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v18, 8, v1 -; VI-NEXT: v_mov_b32_e32 v25, v50 -; VI-NEXT: v_mov_b32_e32 v41, v1 -; VI-NEXT: v_mov_b32_e32 v54, v2 -; VI-NEXT: v_mov_b32_e32 v57, v3 -; VI-NEXT: v_mov_b32_e32 v47, v4 -; VI-NEXT: v_mov_b32_e32 v61, v5 -; VI-NEXT: v_mov_b32_e32 v60, v6 -; VI-NEXT: v_mov_b32_e32 v52, v7 -; VI-NEXT: v_mov_b32_e32 v63, v8 -; VI-NEXT: v_mov_b32_e32 v40, v9 -; VI-NEXT: v_mov_b32_e32 v53, v10 -; VI-NEXT: v_mov_b32_e32 v17, v11 -; VI-NEXT: v_mov_b32_e32 v44, v12 -; VI-NEXT: v_mov_b32_e32 v58, v13 -; VI-NEXT: v_mov_b32_e32 v56, v14 -; VI-NEXT: v_mov_b32_e32 v50, v15 -; VI-NEXT: v_mov_b32_e32 v62, v16 -; VI-NEXT: ; implicit-def: $vgpr1 -; VI-NEXT: ; implicit-def: $vgpr3 -; VI-NEXT: ; implicit-def: $vgpr5 -; VI-NEXT: ; implicit-def: $vgpr7 -; VI-NEXT: ; implicit-def: $vgpr9 -; VI-NEXT: ; implicit-def: $vgpr11 -; VI-NEXT: ; implicit-def: $vgpr13 -; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[3:4] +; VI-NEXT: v_lshrrev_b32_e32 v27, 24, v16 +; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v40, 8, v15 +; VI-NEXT: v_lshrrev_b32_e32 v28, 24, v14 +; VI-NEXT: v_lshrrev_b32_e32 v56, 24, v12 +; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v12 +; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v11 +; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v10 +; VI-NEXT: 
v_lshrrev_b32_e32 v43, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v58, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v60, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v63, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v62, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v54, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v2 +; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v26, 8, v1 ; VI-NEXT: .LBB96_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB96_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v18, 3 -; VI-NEXT: v_add_u16_sdwa v26, v16, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v29, v15, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v62, 3, v16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v26 -; VI-NEXT: v_add_u16_e32 v50, 3, v15 -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v29 -; VI-NEXT: v_or_b32_e32 v16, v62, v16 -; VI-NEXT: v_or_b32_e32 v15, v50, v15 -; VI-NEXT: v_add_u16_sdwa v38, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v49, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v36, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v48, v3, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v34, v6, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v39, v5, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v32, v8, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; 
VI-NEXT: v_add_u16_sdwa v37, v7, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v30, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v35, v9, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v28, v12, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v33, v11, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v27, v14, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v31, v13, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[15:16] -; VI-NEXT: v_add_u16_e32 v56, 3, v14 -; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v27 -; VI-NEXT: v_add_u16_e32 v58, 3, v13 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v31 -; VI-NEXT: v_or_b32_e32 v14, v56, v14 -; VI-NEXT: v_or_b32_e32 v13, v58, v13 -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v17, 3 +; VI-NEXT: v_add_u16_sdwa v50, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v53, 3, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; VI-NEXT: v_add_u16_sdwa v52, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v28, v53, v2 +; VI-NEXT: v_add_u16_e32 v2, 3, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 +; VI-NEXT: v_add_u16_sdwa v48, v4, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v27, v2, v1 +; VI-NEXT: v_add_u16_e32 v54, 3, v4 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v48 +; VI-NEXT: v_add_u16_sdwa v51, v3, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: 
v_or_b32_e32 v24, v54, v1 +; VI-NEXT: v_add_u16_e32 v4, 3, v3 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; VI-NEXT: v_add_u16_sdwa v37, v6, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v23, v4, v1 +; VI-NEXT: v_add_u16_e32 v56, 3, v6 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 +; VI-NEXT: v_add_u16_sdwa v49, v5, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v26, v56, v1 +; VI-NEXT: v_add_u16_e32 v6, 3, v5 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; VI-NEXT: v_add_u16_sdwa v35, v8, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v25, v6, v1 +; VI-NEXT: v_add_u16_e32 v57, 3, v8 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v35 +; VI-NEXT: v_add_u16_sdwa v39, v7, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v47, v57, v1 +; VI-NEXT: v_add_u16_e32 v8, 3, v7 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v39 +; VI-NEXT: v_add_u16_sdwa v33, v10, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v46, v8, v1 +; VI-NEXT: v_add_u16_e32 v58, 3, v10 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 +; VI-NEXT: v_add_u16_sdwa v38, v9, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v10, v58, v1 +; VI-NEXT: v_add_u16_e32 v59, 3, v9 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v38 +; VI-NEXT: v_add_u16_sdwa v31, v12, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v59, v1 +; VI-NEXT: v_add_u16_e32 v60, 3, v12 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v31 +; VI-NEXT: v_add_u16_sdwa v36, v11, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v12, v60, v1 +; VI-NEXT: v_add_u16_e32 v61, 3, v11 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 +; VI-NEXT: v_add_u16_sdwa v30, v14, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v11, v61, v1 +; VI-NEXT: v_add_u16_e32 v62, 3, v14 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v30 +; VI-NEXT: v_add_u16_sdwa v34, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v14, v62, v1 +; VI-NEXT: v_add_u16_e32 v63, 3, v13 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v34 +; VI-NEXT: v_add_u16_sdwa v29, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v13, v63, v1 +; VI-NEXT: v_add_u16_e32 v55, 3, v16 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v29 +; VI-NEXT: v_add_u16_sdwa v32, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v16, v55, v1 +; VI-NEXT: v_add_u16_e32 v40, 3, v15 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 +; VI-NEXT: v_or_b32_e32 v15, v40, v1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] ; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] -; VI-NEXT: v_add_u16_e32 v44, 3, v12 -; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v28 -; VI-NEXT: v_add_u16_e32 v17, 3, v11 -; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v33 -; VI-NEXT: v_add_u16_e32 v53, 3, v10 -; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v30 -; VI-NEXT: v_add_u16_e32 v40, 3, v9 -; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v35 -; VI-NEXT: v_or_b32_e32 v12, v44, v12 -; VI-NEXT: v_or_b32_e32 v11, v17, v11 -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: v_add_u16_e32 v63, 3, v8 -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v32 -; VI-NEXT: v_add_u16_e32 v52, 3, v7 -; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v37 -; VI-NEXT: v_or_b32_e32 v10, v53, v10 -; VI-NEXT: v_or_b32_e32 v9, v40, v9 ; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] -; VI-NEXT: v_add_u16_e32 v60, 3, v6 -; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v34 -; VI-NEXT: v_add_u16_e32 v61, 3, v5 -; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v39 -; VI-NEXT: v_or_b32_e32 v8, v63, v8 -; 
VI-NEXT: v_or_b32_e32 v7, v52, v7 ; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] -; VI-NEXT: v_add_u16_e32 v47, 3, v4 -; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 -; VI-NEXT: v_add_u16_e32 v57, 3, v3 -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 -; VI-NEXT: v_or_b32_e32 v6, v60, v6 -; VI-NEXT: v_or_b32_e32 v5, v61, v5 -; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] -; VI-NEXT: v_add_u16_e32 v54, 3, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; VI-NEXT: v_add_u16_e32 v41, 3, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 -; VI-NEXT: v_or_b32_e32 v4, v47, v4 -; VI-NEXT: v_or_b32_e32 v3, v57, v3 -; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] -; VI-NEXT: v_or_b32_e32 v2, v54, v2 -; VI-NEXT: v_or_b32_e32 v1, v41, v1 -; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] -; VI-NEXT: v_lshrrev_b32_e32 v18, 8, v1 -; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] -; VI-NEXT: v_bfe_u32 v1, v27, 8, 8 -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v13 -; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v9 -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v1, v28, 8, 8 -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v6 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v1, v30, 8, 8 -; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v16 -; VI-NEXT: v_lshrrev_b32_e32 v15, 8, v15 -; VI-NEXT: v_lshrrev_b32_e32 v14, 8, v14 -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v11 -; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v10 -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v5 -; VI-NEXT: v_lshrrev_b32_e32 v55, 8, 
v4 -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v3 -; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v2 -; VI-NEXT: v_bfe_u32 v25, v26, 8, 8 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v1, v32, 8, 8 -; VI-NEXT: v_bfe_u32 v43, v34, 8, 8 -; VI-NEXT: v_bfe_u32 v46, v36, 8, 8 -; VI-NEXT: v_bfe_u32 v59, v38, 8, 8 -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[46:47] +; VI-NEXT: v_lshrrev_b32_e32 v22, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v15 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v14 +; VI-NEXT: v_mov_b32_e32 v14, v62 +; VI-NEXT: v_mov_b32_e32 v15, v40 +; VI-NEXT: v_mov_b32_e32 v40, v16 +; VI-NEXT: v_mov_b32_e32 v16, v55 +; VI-NEXT: v_mov_b32_e32 v55, v22 +; VI-NEXT: v_lshrrev_b32_e32 v62, 8, v23 +; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[23:24] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v13 +; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v11 +; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v47 +; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v26 +; VI-NEXT: v_mov_b32_e32 v9, v59 +; VI-NEXT: v_mov_b32_e32 v11, v61 +; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v25 +; VI-NEXT: v_lshrrev_b64 v[25:26], 24, v[25:26] +; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v24 +; VI-NEXT: v_lshrrev_b64 
v[23:24], 24, v[27:28] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v12 +; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v46 +; VI-NEXT: v_mov_b32_e32 v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, v53 +; VI-NEXT: v_mov_b32_e32 v3, v4 +; VI-NEXT: v_mov_b32_e32 v4, v54 +; VI-NEXT: v_mov_b32_e32 v5, v6 +; VI-NEXT: v_mov_b32_e32 v6, v56 +; VI-NEXT: v_mov_b32_e32 v7, v8 +; VI-NEXT: v_mov_b32_e32 v8, v57 +; VI-NEXT: v_mov_b32_e32 v10, v58 +; VI-NEXT: v_mov_b32_e32 v12, v60 +; VI-NEXT: v_mov_b32_e32 v13, v63 +; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v28 +; VI-NEXT: v_lshrrev_b32_e32 v26, 8, v27 +; VI-NEXT: v_bfe_u32 v27, v29, 8, 8 +; VI-NEXT: v_bfe_u32 v28, v30, 8, 8 +; VI-NEXT: v_bfe_u32 v56, v31, 8, 8 +; VI-NEXT: v_bfe_u32 v57, v33, 8, 8 +; VI-NEXT: v_bfe_u32 v58, v35, 8, 8 +; VI-NEXT: v_bfe_u32 v60, v37, 8, 8 +; VI-NEXT: v_bfe_u32 v63, v48, 8, 8 +; VI-NEXT: v_bfe_u32 v54, v50, 8, 8 ; VI-NEXT: .LBB96_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v18 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v24 -; VI-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v26 +; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v23 +; VI-NEXT: v_or_b32_sdwa v1, v1, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v52, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v53 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v45 -; 
VI-NEXT: v_lshlrev_b16_e32 v2, 8, v59 -; VI-NEXT: v_or_b32_sdwa v1, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v54 +; VI-NEXT: v_or_b32_sdwa v2, v2, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v51 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v23 -; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v22 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v62 +; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v55 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v46 -; VI-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: 
v_lshlrev_b16_e32 v1, 8, v61 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v63 +; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v22 -; VI-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v59 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v25 +; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v43 -; VI-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v47 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v60 +; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, 
v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v46 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v21 -; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v32, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v45 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v58 +; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v42 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v44 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v20 -; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v43 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v57 +; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: 
v_lshlrev_b16_e32 v1, 8, v42 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v19 -; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v44, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v28, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v41 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v56 +; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v31, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 
offset:76 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18 +; VI-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v31, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v28 +; VI-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v27, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 
v2, vcc, 52, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v29, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v40 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v17 +; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v32, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v25 -; VI-NEXT: v_or_b32_sdwa v2, v26, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v55 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v27 +; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v29, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: 
v_add_u32_e32 v0, vcc, 60, v0 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -69561,7 +69471,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 -; SI-NEXT: v_readfirstlane_b32 s15, v27 +; SI-NEXT: v_readfirstlane_b32 s14, v27 ; SI-NEXT: v_readfirstlane_b32 s40, v26 ; SI-NEXT: v_readfirstlane_b32 s12, v19 ; SI-NEXT: v_readfirstlane_b32 s13, v18 @@ -69610,17 +69520,17 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s22, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s14, s23, 24 +; SI-NEXT: s_lshl_b32 s15, s23, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s14, s5 +; SI-NEXT: s_or_b32 s5, s15, s5 ; SI-NEXT: s_or_b32 s41, s4, s5 ; SI-NEXT: s_and_b32 s4, s18, 0xff ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_lshl_b32 s14, s19, 24 -; SI-NEXT: s_or_b32 s4, s14, s4 -; SI-NEXT: s_and_b32 s14, s28, 0xff +; SI-NEXT: s_lshl_b32 s15, s19, 24 +; SI-NEXT: s_or_b32 s4, s15, s4 +; SI-NEXT: s_and_b32 s15, s28, 0xff ; SI-NEXT: s_lshl_b32 s46, s29, 8 -; SI-NEXT: s_or_b32 s14, s14, s46 +; SI-NEXT: s_or_b32 s15, s15, s46 ; SI-NEXT: s_and_b32 s46, s6, 0xff ; SI-NEXT: s_lshl_b32 s46, s46, 16 ; SI-NEXT: s_lshl_b32 s47, s7, 24 @@ -69709,7 +69619,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_or_b32_e32 v63, v59, v34 ; SI-NEXT: v_or_b32_e32 v39, s4, v25 ; SI-NEXT: s_and_b32 s4, s40, 0xff -; SI-NEXT: s_lshl_b32 s56, s15, 8 +; SI-NEXT: s_lshl_b32 s56, s14, 8 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v48, v32, v63 ; 
SI-NEXT: v_and_b32_e32 v32, 0xff, v57 @@ -69732,12 +69642,12 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_or_b32_e32 v33, s4, v33 ; SI-NEXT: s_and_b32 s4, s45, 0xff ; SI-NEXT: s_lshl_b32 s56, s44, 8 -; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_and_b32 s15, s15, 0xffff ; SI-NEXT: s_or_b32 s4, s4, s56 -; SI-NEXT: s_or_b32 s14, s14, s57 +; SI-NEXT: s_or_b32 s15, s15, s57 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: v_alignbit_b32 v1, s41, v1, 16 -; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 +; SI-NEXT: v_alignbit_b32 v5, s15, v5, 16 ; SI-NEXT: v_alignbit_b32 v9, v10, v15, 16 ; SI-NEXT: v_alignbit_b32 v13, v36, v23, 16 ; SI-NEXT: v_alignbit_b32 v21, v38, v29, 16 @@ -69803,7 +69713,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_add_i32 s40, s40, 3 ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x3000000, v1 ; SI-NEXT: s_and_b32 s4, s40, 0xff -; SI-NEXT: s_lshl_b32 s5, s15, 8 +; SI-NEXT: s_lshl_b32 s5, s14, 8 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v28 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 @@ -69907,7 +69817,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_add_i32 s14, s4, 0x3000000 +; SI-NEXT: s_add_i32 s15, s4, 0x3000000 ; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 @@ -69948,7 +69858,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_mov_b32_e32 v0, s47 ; SI-NEXT: v_alignbit_b32 v1, s41, v0, 16 ; SI-NEXT: v_mov_b32_e32 v0, s46 -; SI-NEXT: v_alignbit_b32 v5, s14, v0, 16 +; SI-NEXT: v_alignbit_b32 v5, s15, v0, 16 ; SI-NEXT: v_alignbit_b32 v9, v10, v35, 16 ; SI-NEXT: v_alignbit_b32 v13, v36, v37, 16 ; SI-NEXT: v_alignbit_b32 v17, v18, v39, 16 @@ -69956,7 +69866,7 @@ define inreg <32 x i16> 
@bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_alignbit_b32 v25, v26, v33, 16 ; SI-NEXT: v_alignbit_b32 v29, v48, v34, 16 ; SI-NEXT: s_lshr_b32 s56, s41, 16 -; SI-NEXT: s_lshr_b32 s57, s14, 16 +; SI-NEXT: s_lshr_b32 s57, s15, 16 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v36 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 @@ -69985,7 +69895,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_mov_b32_e32 v2, s41 ; SI-NEXT: v_mov_b32_e32 v3, s56 ; SI-NEXT: v_mov_b32_e32 v4, s46 -; SI-NEXT: v_mov_b32_e32 v6, s14 +; SI-NEXT: v_mov_b32_e32 v6, s15 ; SI-NEXT: v_mov_b32_e32 v7, s57 ; SI-NEXT: v_mov_b32_e32 v8, v35 ; SI-NEXT: v_mov_b32_e32 v12, v37 @@ -70005,7 +69915,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: ; implicit-def: $sgpr46 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr15 ; SI-NEXT: ; implicit-def: $sgpr57 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr9 @@ -70112,11 +70022,11 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 ; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v39 +; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v39 ; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v51, 8, v48 ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v49 +; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v49 ; VI-NEXT: s_cbranch_scc0 .LBB99_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -70175,10 +70085,10 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: v_or_b32_sdwa v1, v43, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; 
VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v45, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v45, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v47, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v57, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v57, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v32, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 @@ -70238,9 +70148,9 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v24 ; VI-NEXT: v_or_b32_sdwa v9, v58, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v23 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v57 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v45 ; VI-NEXT: v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v2, v22, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v22, vcc, 0x300, v3 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v55 ; VI-NEXT: v_or_b32_sdwa v8, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -70297,8 +70207,8 @@ 
define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: s_addk_i32 s6, 0x300 ; VI-NEXT: s_addk_i32 s8, 0x300 ; VI-NEXT: s_addk_i32 s10, 0x300 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v57 ; VI-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v45 ; VI-NEXT: s_addk_i32 s4, 0x300 ; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: s_lshl_b32 s7, s7, 16 @@ -70306,8 +70216,8 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: s_and_b32 s10, s10, 0xffff ; VI-NEXT: s_and_b32 s8, s8, 0xffff ; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 -; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_or_b32 s9, s9, s10 ; VI-NEXT: s_or_b32 s7, s7, s8 ; VI-NEXT: s_or_b32 s5, s5, s6 @@ -78314,7 +78224,6 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v52, s44 -; VI-NEXT: v_mov_b32_e32 v19, s67 ; VI-NEXT: v_mov_b32_e32 v12, s66 ; VI-NEXT: v_mov_b32_e32 v20, s65 ; VI-NEXT: v_mov_b32_e32 v13, s64 @@ -78373,6 +78282,7 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; VI-NEXT: v_mov_b32_e32 v4, s14 ; VI-NEXT: v_mov_b32_e32 v3, s40 ; VI-NEXT: v_mov_b32_e32 v9, s75 +; VI-NEXT: v_mov_b32_e32 v19, s67 ; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v52, s62 @@ -82316,11 +82226,11 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 
x i8> inreg %a, i32 ; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 ; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v39 +; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v39 ; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v51, 8, v48 ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v49 +; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v49 ; VI-NEXT: s_cbranch_scc0 .LBB107_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -82379,10 +82289,10 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: v_or_b32_sdwa v1, v43, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v45, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v45, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v47, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v57, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v57, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v32, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 @@ -82442,9 +82352,9 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: v_add_u32_e32 
v3, vcc, 3, v24 ; VI-NEXT: v_or_b32_sdwa v9, v58, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v23 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v57 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v45 ; VI-NEXT: v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v2, v22, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v22, vcc, 0x300, v3 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v55 ; VI-NEXT: v_or_b32_sdwa v8, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -82501,8 +82411,8 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: s_addk_i32 s6, 0x300 ; VI-NEXT: s_addk_i32 s8, 0x300 ; VI-NEXT: s_addk_i32 s10, 0x300 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v57 ; VI-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v45 ; VI-NEXT: s_addk_i32 s4, 0x300 ; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: s_lshl_b32 s7, s7, 16 @@ -82510,8 +82420,8 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: s_and_b32 s10, s10, 0xffff ; VI-NEXT: s_and_b32 s8, s8, 0xffff ; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 -; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_or_b32 s9, s9, s10 ; VI-NEXT: s_or_b32 s7, s7, s8 ; VI-NEXT: s_or_b32 s5, s5, s6 @@ -85132,6 +85042,8 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: ; kill: killed $vgpr17 ; GFX9-NEXT: ; 
implicit-def: $vgpr17 +; GFX9-NEXT: ; kill: killed $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -85148,47 +85060,46 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr27 -; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: ; implicit-def: $vgpr23 -; GFX9-NEXT: ; implicit-def: $vgpr51 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr29 -; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr52 ; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr25 ; GFX9-NEXT: ; implicit-def: $vgpr31 -; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr62 ; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; 
implicit-def: $vgpr61 ; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr60 ; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr52 -; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr57 ; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr47 ; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr45 ; GFX9-NEXT: ; implicit-def: $vgpr49 -; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr55 ; GFX9-NEXT: ; kill: killed $vgpr17 -; GFX9-NEXT: ; implicit-def: $vgpr54 -; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: ; implicit-def: $vgpr19 @@ -85204,51 +85115,52 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v12 ; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v10 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] ; GFX9-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] ; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] ; GFX9-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] -; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] -; 
GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 24, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v14 +; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 24, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 24, v14 ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 24, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 24, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 24, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 24, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 8, v1 -; 
GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6] -; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 24, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 8, v1 +; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[28:29], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[26:27], 24, v[1:2] ; GFX9-NEXT: .LBB108_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB108_4 @@ -85283,7 +85195,7 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: s_mov_b32 s7, 0x7060302 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v19, v20, vcc -; GFX9-NEXT: v_perm_b32 v27, v1, v18, s7 +; GFX9-NEXT: v_perm_b32 v23, v1, v18, s7 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v4 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; GFX9-NEXT: 
v_bfe_u32 v19, v1, 16, 1 @@ -85312,7 +85224,7 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v21, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v20, v21, vcc -; GFX9-NEXT: v_perm_b32 v29, v3, v19, s7 +; GFX9-NEXT: v_perm_b32 v30, v3, v19, s7 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; GFX9-NEXT: v_bfe_u32 v20, v3, 16, 1 @@ -85341,7 +85253,7 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v22, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v22, vcc -; GFX9-NEXT: v_perm_b32 v31, v5, v20, s7 +; GFX9-NEXT: v_perm_b32 v32, v5, v20, s7 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v8 ; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1 @@ -85361,278 +85273,281 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_bfe_u32 v22, v21, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX9-NEXT: v_add3_u32 v22, v22, v21, s6 -; GFX9-NEXT: v_or_b32_e32 v23, 0x400000, v21 +; GFX9-NEXT: v_or_b32_e32 v25, 0x400000, v21 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 ; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v21, v22, v23, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v21, v22, v25, vcc ; GFX9-NEXT: v_bfe_u32 v22, v7, 16, 1 ; GFX9-NEXT: v_add3_u32 v22, v22, v7, s6 -; GFX9-NEXT: v_or_b32_e32 v23, 0x400000, v7 +; GFX9-NEXT: v_or_b32_e32 v25, 0x400000, v7 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v22, v23, vcc -; GFX9-NEXT: v_perm_b32 v33, v7, v21, s7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v22, v25, vcc +; GFX9-NEXT: v_perm_b32 v34, v7, v21, s7 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v10 ; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; GFX9-NEXT: v_bfe_u32 v22, v7, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX9-NEXT: 
v_add3_u32 v22, v22, v7, s6 -; GFX9-NEXT: v_or_b32_e32 v23, 0x400000, v7 +; GFX9-NEXT: v_or_b32_e32 v25, 0x400000, v7 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v22, v23, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v7, v22, v25, vcc ; GFX9-NEXT: v_bfe_u32 v22, v10, 16, 1 ; GFX9-NEXT: v_add3_u32 v22, v22, v10, s6 -; GFX9-NEXT: v_or_b32_e32 v23, 0x400000, v10 +; GFX9-NEXT: v_or_b32_e32 v25, 0x400000, v10 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v22, v23, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v10, v22, v25, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v9 ; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; GFX9-NEXT: v_bfe_u32 v23, v22, 16, 1 +; GFX9-NEXT: v_bfe_u32 v25, v22, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX9-NEXT: v_add3_u32 v23, v23, v22, s6 -; GFX9-NEXT: v_or_b32_e32 v24, 0x400000, v22 +; GFX9-NEXT: v_add3_u32 v25, v25, v22, s6 +; GFX9-NEXT: v_or_b32_e32 v26, 0x400000, v22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 ; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v22, v23, v24, vcc -; GFX9-NEXT: v_bfe_u32 v23, v9, 16, 1 -; GFX9-NEXT: v_add3_u32 v23, v23, v9, s6 -; GFX9-NEXT: v_or_b32_e32 v24, 0x400000, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v22, v25, v26, vcc +; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1 +; GFX9-NEXT: v_add3_u32 v25, v25, v9, s6 +; GFX9-NEXT: v_or_b32_e32 v26, 0x400000, v9 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v23, v24, vcc -; GFX9-NEXT: v_perm_b32 v35, v9, v22, s7 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v26, vcc +; GFX9-NEXT: v_perm_b32 v36, v9, v22, s7 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v12 ; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GFX9-NEXT: v_bfe_u32 v23, v9, 16, 1 +; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX9-NEXT: v_add3_u32 v23, v23, v9, s6 -; GFX9-NEXT: v_or_b32_e32 v24, 0x400000, v9 +; 
GFX9-NEXT: v_add3_u32 v25, v25, v9, s6 +; GFX9-NEXT: v_or_b32_e32 v26, 0x400000, v9 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v23, v24, vcc -; GFX9-NEXT: v_bfe_u32 v23, v12, 16, 1 -; GFX9-NEXT: v_add3_u32 v23, v23, v12, s6 -; GFX9-NEXT: v_or_b32_e32 v24, 0x400000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v26, vcc +; GFX9-NEXT: v_bfe_u32 v25, v12, 16, 1 +; GFX9-NEXT: v_add3_u32 v25, v25, v12, s6 +; GFX9-NEXT: v_or_b32_e32 v26, 0x400000, v12 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v23, v24, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v11 -; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; GFX9-NEXT: v_bfe_u32 v24, v23, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v25, v26, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v11 +; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; GFX9-NEXT: v_bfe_u32 v26, v25, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX9-NEXT: v_add3_u32 v24, v24, v23, s6 -; GFX9-NEXT: v_or_b32_e32 v25, 0x400000, v23 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX9-NEXT: v_add3_u32 v26, v26, v25, s6 +; GFX9-NEXT: v_or_b32_e32 v27, 0x400000, v25 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 ; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc -; GFX9-NEXT: v_bfe_u32 v24, v11, 16, 1 -; GFX9-NEXT: v_add3_u32 v24, v24, v11, s6 -; GFX9-NEXT: v_or_b32_e32 v25, 0x400000, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v25, v26, v27, vcc +; GFX9-NEXT: v_bfe_u32 v26, v11, 16, 1 +; GFX9-NEXT: v_add3_u32 v26, v26, v11, s6 +; GFX9-NEXT: v_or_b32_e32 v27, 0x400000, v11 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v24, v25, vcc -; GFX9-NEXT: v_perm_b32 v37, v11, v23, s7 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v26, v27, vcc +; GFX9-NEXT: v_perm_b32 v38, v11, v25, s7 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v14 ; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; 
GFX9-NEXT: v_bfe_u32 v24, v11, 16, 1 +; GFX9-NEXT: v_bfe_u32 v26, v11, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX9-NEXT: v_add3_u32 v24, v24, v11, s6 -; GFX9-NEXT: v_or_b32_e32 v25, 0x400000, v11 +; GFX9-NEXT: v_add3_u32 v26, v26, v11, s6 +; GFX9-NEXT: v_or_b32_e32 v27, 0x400000, v11 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v24, v25, vcc -; GFX9-NEXT: v_bfe_u32 v24, v14, 16, 1 -; GFX9-NEXT: v_add3_u32 v24, v24, v14, s6 -; GFX9-NEXT: v_or_b32_e32 v25, 0x400000, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v26, v27, vcc +; GFX9-NEXT: v_bfe_u32 v26, v14, 16, 1 +; GFX9-NEXT: v_add3_u32 v26, v26, v14, s6 +; GFX9-NEXT: v_or_b32_e32 v27, 0x400000, v14 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v24, v25, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v13 -; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; GFX9-NEXT: v_bfe_u32 v25, v24, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v26, v27, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v13 +; GFX9-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX9-NEXT: v_bfe_u32 v27, v26, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_add3_u32 v25, v25, v24, s6 -; GFX9-NEXT: v_or_b32_e32 v26, 0x400000, v24 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX9-NEXT: v_add3_u32 v27, v27, v26, s6 +; GFX9-NEXT: v_or_b32_e32 v28, 0x400000, v26 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 ; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v24, v25, v26, vcc -; GFX9-NEXT: v_bfe_u32 v25, v13, 16, 1 -; GFX9-NEXT: v_add3_u32 v25, v25, v13, s6 -; GFX9-NEXT: v_or_b32_e32 v26, 0x400000, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v26, v27, v28, vcc +; GFX9-NEXT: v_bfe_u32 v27, v13, 16, 1 +; GFX9-NEXT: v_add3_u32 v27, v27, v13, s6 +; GFX9-NEXT: v_or_b32_e32 v28, 0x400000, v13 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v25, v26, vcc -; GFX9-NEXT: 
v_perm_b32 v48, v13, v24, s7 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v27, v28, vcc +; GFX9-NEXT: v_perm_b32 v48, v13, v26, s7 ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v16 ; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX9-NEXT: v_bfe_u32 v25, v13, 16, 1 +; GFX9-NEXT: v_bfe_u32 v27, v13, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX9-NEXT: v_add3_u32 v25, v25, v13, s6 -; GFX9-NEXT: v_or_b32_e32 v26, 0x400000, v13 +; GFX9-NEXT: v_add3_u32 v27, v27, v13, s6 +; GFX9-NEXT: v_or_b32_e32 v28, 0x400000, v13 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v25, v26, vcc -; GFX9-NEXT: v_bfe_u32 v25, v16, 16, 1 -; GFX9-NEXT: v_add3_u32 v25, v25, v16, s6 -; GFX9-NEXT: v_or_b32_e32 v26, 0x400000, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v27, v28, vcc +; GFX9-NEXT: v_bfe_u32 v27, v16, 16, 1 +; GFX9-NEXT: v_add3_u32 v27, v27, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v28, 0x400000, v16 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v25, v26, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v15 -; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; GFX9-NEXT: v_bfe_u32 v26, v25, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v27, v28, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v15 +; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GFX9-NEXT: v_bfe_u32 v28, v27, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX9-NEXT: v_add3_u32 v26, v26, v25, s6 -; GFX9-NEXT: v_or_b32_e32 v39, 0x400000, v25 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX9-NEXT: v_add3_u32 v28, v28, v27, s6 +; GFX9-NEXT: v_or_b32_e32 v29, 0x400000, v27 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 ; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v25, v26, v39, vcc -; GFX9-NEXT: v_bfe_u32 v26, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v26, v26, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v39, 0x400000, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v27, v28, v29, vcc +; GFX9-NEXT: 
v_bfe_u32 v28, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v28, v28, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v29, 0x400000, v15 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v26, v39, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v15, v28, v29, vcc ; GFX9-NEXT: v_perm_b32 v51, v16, v13, s7 -; GFX9-NEXT: v_perm_b32 v50, v15, v25, s7 -; GFX9-NEXT: v_perm_b32 v28, v2, v17, s7 -; GFX9-NEXT: v_perm_b32 v30, v4, v1, s7 +; GFX9-NEXT: v_perm_b32 v50, v15, v27, s7 +; GFX9-NEXT: v_perm_b32 v24, v2, v17, s7 +; GFX9-NEXT: v_perm_b32 v31, v4, v1, s7 ; GFX9-NEXT: v_perm_b32 v49, v14, v11, s7 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v18 ; GFX9-NEXT: v_lshrrev_b64 v[17:18], 24, v[50:51] -; GFX9-NEXT: v_perm_b32 v32, v6, v3, s7 -; GFX9-NEXT: v_perm_b32 v38, v12, v9, s7 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v6 +; GFX9-NEXT: v_perm_b32 v33, v6, v3, s7 +; GFX9-NEXT: v_perm_b32 v39, v12, v9, s7 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v6 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v19 ; GFX9-NEXT: v_lshrrev_b64 v[18:19], 24, v[48:49] -; GFX9-NEXT: v_perm_b32 v34, v8, v5, s7 -; GFX9-NEXT: v_perm_b32 v36, v10, v7, s7 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v20 -; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[37:38] -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v21 -; GFX9-NEXT: v_lshrrev_b64 v[20:21], 24, v[35:36] +; GFX9-NEXT: v_perm_b32 v35, v8, v5, s7 +; GFX9-NEXT: v_perm_b32 v37, v10, v7, s7 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, 
v14 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v11 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v22 -; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[33:34] +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v20 +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[38:39] +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v26 ; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v23 -; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[29:30] +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v21 +; GFX9-NEXT: v_lshrrev_b64 v[20:21], 24, v[36:37] +; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[30:31] ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v25 -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v24 -; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[31:32] -; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[27:28] -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 24, v51 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v51 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v50 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v50 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v27 +; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v22 +; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[34:35] +; GFX9-NEXT: v_lshrrev_b64 v[28:29], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b64 v[26:27], 24, v[23:24] +; GFX9-NEXT: v_lshrrev_b32_e32 
v55, 24, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 24, v49 ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v49 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v48 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v48 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 24, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 24, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v38 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v38 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 24, v37 ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v37 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v36 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v36 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 24, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 24, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v34 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 24, v33 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 24, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v32 ; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v31 ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v31 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 24, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 8, v30 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 8, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 24, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 
v53, 8, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 8, v23 ; GFX9-NEXT: .LBB108_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v27 -; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v23 -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v23, v28, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v51 +; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v23 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v53 +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v26 +; GFX9-NEXT: v_or_b32_sdwa v22, v24, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v50 -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v52 +; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v22 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v29 -; GFX9-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v30 +; GFX9-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v30 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v63 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v31 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v50 ; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v31 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v32 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v28 ; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v62, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v32 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v61 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v33 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v62 
; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v43, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v33 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v34 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v21 ; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v61, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v34 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v59 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v35 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v60 ; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v35 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v20 ; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v56 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v37 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v57 ; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v37 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v38 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v19 ; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40 ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v38 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v46 +; GFX9-NEXT: 
v_lshlrev_b16_e32 v1, 8, v39 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v47 ; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -85641,26 +85556,26 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v48 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v44, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v49 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v43 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v44 ; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v41 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v42 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v17 ; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v41, v2 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v55 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v54 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v40 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v55 ; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -85689,62 +85604,62 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v17 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 -; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -85757,19 +85672,19 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 24, v14 -; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v53, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v7 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v6 @@ -85781,31 +85696,31 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v16.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v14.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v10.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v8.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v6.h +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v27.h, v4.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v6.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v8.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.h, v9.l +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.h, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.h, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.h, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.h, v11.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v10.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.h, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.h, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.h, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.h, v14.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.h, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.h, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.h, v16.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.h, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, v1.l ; GFX11-TRUE16-NEXT: .LBB108_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: 
s_cbranch_execz .LBB108_4 @@ -85814,10 +85729,11 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_add_f32 v17, 0x40c00000, v17 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_lshlrev_b32 v11, 16, v11 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 0x40c00000, v17 :: v_dual_add_f32 v2, 0x40c00000, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v17, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v17 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 @@ -85828,11 +85744,11 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_add3_u32 v21, v21, v2, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v20, v22, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v29, v20, v22, vcc_lo ; GFX11-TRUE16-NEXT: v_dual_add_f32 v18, 0x40c00000, v18 :: v_dual_add_f32 v1, 0x40c00000, v1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v27.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v29.h ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1 ; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v1, 16, 1 @@ -85845,188 +85761,181 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v18, 0x7fff ; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v26 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v28, v20, v21, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v20, v21, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 ; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v28.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v32.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v17, v22, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v18, v18, v4, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v4 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v3 ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v19, 16, 1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v30, v18, v22, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v20, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v18, v22, vcc_lo ; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v6 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 0x40c00000, v21 :: v_dual_lshlrev_b32 v6, 16, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v19, 0x7fff ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, 
v3 ; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v21, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_cndmask_b32 v29, v17, v23 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_add_f32 v6, 0x40c00000, v6 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v20, 0x40c00000, v20 :: v_dual_cndmask_b32 v27, v17, v23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v31.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v3, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v3 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v21, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v21 ; GFX11-TRUE16-NEXT: v_add3_u32 v18, v18, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v30.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v18, v19, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v35, v18, v19 :: v_dual_add_f32 v18, 0x40c00000, v22 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v20, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v32.h -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v4, v23 :: v_dual_add_f32 v18, 0x40c00000, v22 -; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v17, v29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v35.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v18, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v23, vcc_lo +; 
GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v17, v27 ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v6, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v19, v3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v18, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v18 +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v19, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v19, v21, v18, 0x7fff ; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v6, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v20, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v7 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v4 -; GFX11-TRUE16-NEXT: v_add3_u32 v19, v21, v18, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v17, v21, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v31, v19, v22 :: v_dual_and_b32 v20, 0xffff0000, v5 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v20, 0x40c00000, v20 :: v_dual_lshlrev_b32 v5, 16, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v8 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_lshlrev_b32 v8, 16, v8 -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v20, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v20, 0x7fff +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v3 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v28, v19, v22 :: v_dual_and_b32 v19, 0xffff0000, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX11-TRUE16-NEXT: v_or_b32_e32 
v22, 0x400000, v20 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v33.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v8, 0x40c00000, v8 :: v_dual_add_f32 v5, 0x40c00000, v5 ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v20, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v17, v21, vcc_lo +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v38, v17, v21 :: v_dual_add_f32 v19, 0x40c00000, v19 ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v8, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v36.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v6, v22, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v38.h ; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v19, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v6, v22, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v6, v17, v8, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v8 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v19 ; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v19, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v35, v6, v17, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, 
v19 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v37, v6, v17, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v7 -; GFX11-TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v18, v31 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v5, v22, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v30, v5, v22, vcc_lo ; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v21, v20 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v10 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v35.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v20, 0x40c00000, v20 :: v_dual_add_f32 v7, 0x40c00000, v7 -; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v8, v34 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 0x40c00000, v23 :: v_dual_add_f32 v20, 0x40c00000, v20 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v18, v28 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v17 ; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v7, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v7 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v8 +; GFX11-TRUE16-NEXT: v_add3_u32 
v18, v18, v17, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v37.h ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v7, 0x7fff -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v20, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v6 +; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v8, v30 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v49, v19, v21, vcc_lo -; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 0x40c00000, v23 :: v_dual_add_f32 v10, 0x40c00000, v10 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v20, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v20 -; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v17 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 ; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v10, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add3_u32 v18, v18, v17, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v10, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v20, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v8 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v18, v22, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v10, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v10 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v49.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v5 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v39, v7, v21, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v12 ; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v48, v19, v22 :: v_dual_lshlrev_b32 v7, 16, v9 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v49.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v19, v22, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 0x40c00000, v10 :: v_dual_add_f32 v12, 0x40c00000, v12 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h -; GFX11-TRUE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v21, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v21 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 -; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v19, v48 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v12, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v12 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 ; GFX11-TRUE16-NEXT: v_add3_u32 v22, v22, v21, 0x7fff -; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v50, 0x400000, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v9 ; GFX11-TRUE16-NEXT: v_add3_u32 v24, v24, v12, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v54, v22, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v19, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v50, v22, v36 :: v_dual_add_f32 v9, 0x40c00000, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v14 +; GFX11-TRUE16-NEXT: 
v_dual_add_f32 v7, 0x40c00000, v7 :: v_dual_lshlrev_b32 v14, 16, v14 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v48 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v7, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v7 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-TRUE16-NEXT: v_add3_u32 v20, v20, v9, 0x7fff +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v10 ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v7, 0x7fff -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v11 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v54.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v14, 0x40c00000, v14 :: v_dual_lshlrev_b32 v11, 16, v11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v50.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v65, v19, v25, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v9 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v14, 16, 1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v10 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v52, v24, v50 :: v_dual_add_f32 v9, 0x40c00000, v23 ; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v21, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v7, v52 -; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v9, 16, 1 -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v14, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v24, v51, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v7, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v20, v23, vcc_lo ; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v11 ; GFX11-TRUE16-NEXT: v_add3_u32 v11, v19, v21, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v20, v20, v9, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v21 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 8, v12 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v20, v23, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 24, v12 ; GFX11-TRUE16-NEXT: v_bfe_u32 v23, v7, 16, 1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v65.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v12 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v11, v19, vcc_lo ; GFX11-TRUE16-NEXT: v_add_f32_e32 v19, 0x40c00000, v22 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) @@ -86038,26 +85947,26 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v23 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v71, v21, v22, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v67, v21, v22, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v21, v24, v19, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v19 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 ; GFX11-TRUE16-NEXT: v_add3_u32 v23, v25, v14, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v14 ; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v7, 16, 1 -; GFX11-TRUE16-NEXT: 
v_dual_add_f32 v13, 0x40c00000, v13 :: v_dual_cndmask_b32 v66, v21, v22 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 0x40c00000, v13 :: v_dual_cndmask_b32 v48, v21, v22 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v7 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add3_u32 v14, v25, v7, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v16 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v68, v23, v24, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v64, v23, v24, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 ; GFX11-TRUE16-NEXT: v_bfe_u32 v23, v13, 16, 1 -; GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v20, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v65.h ; GFX11-TRUE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v68.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v64.h ; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v7, v14, v19 :: v_dual_add_f32 v14, 0x40c00000, v21 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v15 ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v23, v13, 0x7fff @@ -86067,42 +85976,42 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v16 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v86, v19, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v81, v19, v23, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v13, v13, v16, 0x7fff ; GFX11-TRUE16-NEXT: v_bfe_u32 v23, v21, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 ; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v14, 16, 1 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v52, 0x400000, v21 ; GFX11-TRUE16-NEXT: v_add3_u32 v23, v23, v21, 0x7fff -; GFX11-TRUE16-NEXT: 
v_cndmask_b32_e32 v82, v13, v25, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v70, v13, v25, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v24, v14, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v14 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v15, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v51, v15, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v15 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v85, v23, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v71, v23, v52, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v86.h -; GFX11-TRUE16-NEXT: v_add3_u32 v13, v37, v15, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v71.h -; GFX11-TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v22, v66 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v81, v19, v24, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v81.h +; GFX11-TRUE16-NEXT: v_add3_u32 v13, v51, v15, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v67.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v22, v48 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v53, v19, v24, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v82.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v85.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v70.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v71.h ; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v23, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 24, v14 +; GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v20, v9 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v16, 0xffff, v19, v81 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v9 +; GFX11-TRUE16-NEXT: v_bfi_b32 v16, 0xffff, v19, v53 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 8, v14 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v82, 8, v11 ; GFX11-TRUE16-NEXT: v_bfi_b32 v15, 0xffff, v15, v13 ; GFX11-TRUE16-NEXT: v_bfi_b32 v13, 0xffff, v21, v7 ; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0xffff, v18, v17 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 8, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] @@ -86111,14 +86020,16 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v9 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v7 ; GFX11-TRUE16-NEXT: .LBB108_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v28.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v32.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v113.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v29.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v112.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v26.h @@ -86128,13 +86039,13 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v103.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, 0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: 
v_and_b16 v1.l, 0xff, v35.h ; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v1.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v102.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.l, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v31.h ; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v101.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v24 @@ -86145,9 +86056,9 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v2.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v8, v24 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v36.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v38.h ; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v3.h, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v29.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v27.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v99.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v100.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v3.l @@ -86167,7 +86078,7 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v5.l, v5.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v28.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v96.l ; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v6.h, v7.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v24 @@ -86176,22 +86087,22 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v6.l, v8.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v7.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v35.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v37.h ; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v8.l, 8, v87.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v24 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v12 ; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v7.l, v8.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v65.h ; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v84.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v69.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v86.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v82.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v24 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v83.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v85.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v30.h ; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v84.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v8.l, v10.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v9.l @@ -86200,29 +86111,29 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.h, v10.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v24 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v71.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v67.h ; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v9.l, v9.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v48.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v70.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v83.l ; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v10.h, v11.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v14, v24 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v67.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v80.l ; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v14, 0xffff, v16 ; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v10.l, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v54.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v50.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v11.l ; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v64.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v69.l ; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v86.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v81.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v14, v24 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v11.l, v11.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v52.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v55.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v36.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v68.l ; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v12.h, v13.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v16, v24 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v19 @@ -86230,22 +86141,22 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v12.l, v14.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v13.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v68.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v53.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v64.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v66.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v16, v24 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v19 ; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v13.l, v14.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v85.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v71.h ; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v13.h, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v51.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v55.l 
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v16, v24 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v50.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v54.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v66.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v48.h ; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v14.h, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v82.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v70.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v52.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v14.l, v16.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v15.l @@ -86254,10 +86165,10 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v14.h, v16.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v24 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v51.l ; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v15.l, v15.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v81.h +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v53.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v24 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 @@ -86275,107 +86186,107 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v17 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr19 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21 +; GFX11-FAKE16-NEXT: ; 
implicit-def: $vgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr20 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 -; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr19 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr18 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr17 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB108_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[27:28], 24, v[7:8] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[28:29], 24, v[5:6] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[24:25], 24, 
v[1:2] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 24, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 8, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 8, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 24, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 8, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 24, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 8, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 8, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 24, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 8, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 8, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 24, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 8, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 8, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 24, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 8, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 8, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 24, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 8, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 24, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 8, v2 -; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[22:23], 24, v[11:12] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[29:30], 24, v[3:4] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 24, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 8, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 8, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 24, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 8, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 8, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 24, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 8, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 8, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 8, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 24, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 8, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 8, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 24, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 8, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 24, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 8, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 16, v3 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 24, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 8, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[23:24], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[30:31], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 8, v1 ; GFX11-FAKE16-NEXT: .LBB108_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB108_4 @@ -86408,222 +86319,227 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, 0x400000, v19 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v2, v1, 0x7060302 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v18, 0x40c00000, v21 :: v_dual_lshlrev_b32 v21, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v2 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v21 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v22, v17, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v18 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v20, v23, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v20, v18, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, 0x400000, v4 -; GFX11-FAKE16-NEXT: v_bfe_u32 v24, v21, 16, 1 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v19, v17, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v20, v20, v18, 0x7fff -; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v2, v1, 0x7060302 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v17 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v20, v23, vcc_lo +; GFX11-FAKE16-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_lshlrev_b32 v23, 16, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v18, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, 0x400000, v18 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v19, v17, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v22, v22, v18, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v26, v23, 16, 1 ; GFX11-FAKE16-NEXT: v_add3_u32 v19, v19, v4, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v20, v22, vcc_lo +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v18, v22, v24 :: v_dual_and_b32 v3, 0xffff0000, v3 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 24, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 16, v26 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, v19, v23 :: v_dual_lshlrev_b32 v23, 16, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, 0x400000, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 24, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 16, v20 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v19, v25, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v19, v26, v23, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v4 +; GFX11-FAKE16-NEXT: 
v_dual_cndmask_b32 v19, v19, v24 :: v_dual_lshlrev_b32 v24, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v6 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GFX11-FAKE16-NEXT: v_add3_u32 v19, v24, v21, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 -; GFX11-FAKE16-NEXT: v_perm_b32 v29, v4, v18, 0x7060302 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GFX11-FAKE16-NEXT: v_bfe_u32 v20, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, 0x400000, v3 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v19, v19, v22 :: v_dual_lshlrev_b32 v22, 16, v5 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v23 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v20, v20, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_add_f32 v24, 0x40c00000, v24 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 0x40c00000, v25 :: v_dual_add_f32 v6, 0x40c00000, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v3 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v4 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, 0x400000, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_add3_u32 v22, v22, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v29, v24, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v22, v26, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v23, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v4, v18, 0x7060302 ; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v18 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v20, v24, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v20, v21, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_bfe_u32 v25, v22, 16, 1 -; GFX11-FAKE16-NEXT: v_perm_b32 v28, v3, v19, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v3, v19, 0x7060302 ; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v20, v20, v21, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v28 +; GFX11-FAKE16-NEXT: v_add3_u32 v22, v22, v23, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v6, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v20, v20, v23 :: v_dual_add_f32 v5, 0x40c00000, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v22, v22, v27, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v25, v22, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, 0x400000, v22 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v24, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v8 -; GFX11-FAKE16-NEXT: v_bfe_u32 v21, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_add3_u32 v21, v21, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v22, v6, v23, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v24 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v29, v24, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v24 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v28, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 
vcc_lo, v24, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v97, v3, v22, 0x7060302 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v24, v6, v27 :: v_dual_add_f32 v5, 0x40c00000, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v28 :: v_dual_lshlrev_b32 v27, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v19 +; GFX11-FAKE16-NEXT: v_bfe_u32 v23, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v7 -; GFX11-FAKE16-NEXT: v_perm_b32 v86, v3, v20, 0x7060302 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, 0x400000, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v21, v25 :: v_dual_add_f32 v8, 0x40c00000, v8 -; GFX11-FAKE16-NEXT: v_bfe_u32 v21, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v23, v23, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v23, v29 :: v_dual_and_b32 v8, 0xffff0000, v8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX11-FAKE16-NEXT: v_bfe_u32 v23, v6, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v3 -; GFX11-FAKE16-NEXT: v_perm_b32 v85, v5, v22, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v96, v5, v24, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v8, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v21, v21, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v85 +; GFX11-FAKE16-NEXT: v_add3_u32 v23, v23, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 16, v96 ; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v8, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v21, v24, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v23, v28, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v25, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v10 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v96 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v29, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v10 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX11-FAKE16-NEXT: v_perm_b32 v83, v5, v6, 0x7060302 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v30, v23, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, 0x400000, v23 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v85, v5, v6, 0x7060302 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v30, v27, 16, 
1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v27 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 ; GFX11-FAKE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v22 -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v30, v23, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v23, v8, v24 :: v_dual_lshlrev_b32 v24, 16, v9 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v24 +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v30, v27, 0x7fff +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 24, v85 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v27, v8, v28 :: v_dual_lshlrev_b32 v28, 16, v9 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v29 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v31, v24, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_bfe_u32 v31, v28, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v8 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GFX11-FAKE16-NEXT: v_bfe_u32 v21, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v23, v7, 16, 1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, 0x400000, v7 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v21, v21, v7, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v21, v30, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v21, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 
v23, v23, v7, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v23, v30, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v23, v8, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, 0x400000, v10 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v82, v7, v23, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v84, v7, v27, 0x7060302 ; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v10, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v21, v21, v8, 0x7fff -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v82 +; GFX11-FAKE16-NEXT: v_add3_u32 v23, v23, v8, 0x7fff +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 8, v84 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v10, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v21, v25, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v23, v29, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-FAKE16-NEXT: v_add3_u32 v10, v31, v24, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 8, v82 +; GFX11-FAKE16-NEXT: v_add3_u32 v10, v31, v28, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v28 ; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v7, v7, v30 :: v_dual_lshlrev_b32 v30, 16, v12 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v70, v7, v8, 0x7060302 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v7 +; GFX11-FAKE16-NEXT: v_perm_b32 v81, v7, v8, 0x7060302 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 ; GFX11-FAKE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v10, v25, vcc_lo +; GFX11-FAKE16-NEXT: 
v_cndmask_b32_e32 v28, v10, v29, vcc_lo ; GFX11-FAKE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 8, v70 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 8, v81 ; GFX11-FAKE16-NEXT: v_bfe_u32 v30, v12, 16, 1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add3_u32 v30, v30, v12, 0x7fff ; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX11-FAKE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v21, v9, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v23, v9, 16, 1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, 0x400000, v9 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-FAKE16-NEXT: v_add3_u32 v21, v21, v9, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v23, v23, v9, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v21, v31, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v11 -; GFX11-FAKE16-NEXT: v_bfe_u32 v25, v10, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v23, v31, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v11 +; GFX11-FAKE16-NEXT: v_bfe_u32 v29, v10, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-FAKE16-NEXT: v_perm_b32 v69, v9, v24, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v80, v9, v28, 0x7060302 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v21 -; GFX11-FAKE16-NEXT: v_add3_u32 v21, v25, v10, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v10 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v23 +; GFX11-FAKE16-NEXT: v_add3_u32 v23, v29, v10, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 
v29, 0x400000, v10 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, 0x400000, v12 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v9, 16, 1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v69 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v21, v25, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v80 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v23, v29, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v9 -; GFX11-FAKE16-NEXT: v_add3_u32 v21, v32, v9, 0x7fff -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 8, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_add3_u32 v23, v32, v9, 0x7fff +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 8, v80 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v30, v31, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v14 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX11-FAKE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GFX11-FAKE16-NEXT: v_perm_b32 v55, v12, v10, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v21, v25, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v30 +; GFX11-FAKE16-NEXT: v_perm_b32 v65, v12, v10, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v23, v29, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v23, 0x40c00000, v30 ; GFX11-FAKE16-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; GFX11-FAKE16-NEXT: v_bfe_u32 v31, v11, 16, 1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, 0x400000, v11 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v21, 16, 1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 8, v55 -; GFX11-FAKE16-NEXT: v_add3_u32 v25, v31, v11, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v23, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 8, v65 +; GFX11-FAKE16-NEXT: v_add3_u32 v29, v31, v11, 0x7fff ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v13 ; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v13, 0xffff0000, v13 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v25, v30, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v25, 0x40c00000, v31 -; GFX11-FAKE16-NEXT: v_add3_u32 v30, v32, v21, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, 0x400000, v21 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v29, v30, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v29, 0x40c00000, v31 +; GFX11-FAKE16-NEXT: v_add3_u32 v30, v32, v23, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, 0x400000, v23 ; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v14, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v29, 16, 1 ; GFX11-FAKE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v25 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v21, v30, v31, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v29 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v23, v30, v31, vcc_lo ; GFX11-FAKE16-NEXT: v_add3_u32 v30, v32, v14, 0x7fff ; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, 0x400000, v14 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v33, v25, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v33, v29, 0x7fff ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v16 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, v30, v31, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v33 ; GFX11-FAKE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GFX11-FAKE16-NEXT: v_perm_b32 v54, v11, v9, 0x7060302 -; GFX11-FAKE16-NEXT: v_perm_b32 v50, v14, v21, 0x7060302 -; 
GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v25, v32, v34 :: v_dual_lshlrev_b32 v34, 16, v15 +; GFX11-FAKE16-NEXT: v_perm_b32 v64, v11, v9, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v52, v14, v23, 0x7060302 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v29, v32, v34 :: v_dual_lshlrev_b32 v34, 16, v15 ; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v13, 16, 1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, 0x400000, v13 ; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v30, 16, 1 @@ -86631,8 +86547,8 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX11-FAKE16-NEXT: v_add3_u32 v31, v35, v13, 0x7fff ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v50 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[27:28], 24, v[84:85] ; GFX11-FAKE16-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v31, v32, vcc_lo ; GFX11-FAKE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v34 @@ -86641,191 +86557,190 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v16, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 ; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v31, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v30, v15, 16, 1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v31 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v15 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v33, vcc_lo +; GFX11-FAKE16-NEXT: v_perm_b32 v51, v13, v29, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc_lo ; GFX11-FAKE16-NEXT: v_add3_u32 v33, v34, v16, 0x7fff ; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v16 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v15, 16, 1 ; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v31, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v30, v30, v15, 0x7fff -; 
GFX11-FAKE16-NEXT: v_perm_b32 v49, v13, v25, 0x7060302 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[28:29], 24, v[96:97] ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v15, 0x7fff +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v23 +; GFX11-FAKE16-NEXT: v_perm_b32 v39, v16, v30, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v35, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v10 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v8 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v35, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v49 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v54 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v30, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_perm_b32 v37, v16, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v32 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v36, v15, v31, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v32, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[22:23], 24, v[64:65] +; GFX11-FAKE16-NEXT: v_perm_b32 v38, v15, v31, 0x7060302 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[29:30], 24, v[25:26] ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v31 -; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 24, v37 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[36:37] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[49:50] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[19:20], 24, v[54:55] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[20:21], 24, v[69:70] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[21:22], 24, v[82:83] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[22:23], 24, v[85:86] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[23:24], 24, v[28:29] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[26:27] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 8, v37 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v36 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 8, v36 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 24, v50 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 8, v49 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 24, v55 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 8, v54 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 24, v70 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 24, v83 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 8, v83 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 24, v86 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 8, v86 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 8, v85 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 24, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 8, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 8, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 8, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[23:24], 24, v[80:81] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[30:31], 24, v[20:21] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[38:39] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[51:52] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 24, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 8, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v38 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v38, 8, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 24, v52 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 8, v52 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v51 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 8, v51 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 24, v65 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v64 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 8, v64 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 24, v81 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 8, v85 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v84 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 24, v97 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 8, v97 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 24, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 8, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 8, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 8, v20 ; GFX11-FAKE16-NEXT: .LBB108_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v25 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v97 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v99 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v30 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v25 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v26, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v81 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v96 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v20, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v20, 8, v21 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v83 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v28, 8, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v98 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v97 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v26, v27 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v81, 0xff, v87 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v29 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v86 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v21, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v70 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v30, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v96 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v68 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v85 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v84 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v87 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v86 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v28, 8, v28 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v83 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xff, v65 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v68, 8, v82 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v81, 
v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v26 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v28, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v67 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v31, 8, v84 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v26, v28 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v29 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v65, v68 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v30, v31 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v24 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v25 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v80 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v71 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v21 +; GFX11-FAKE16-NEXT: 
v_lshlrev_b16 v19, 8, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v81 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v27 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v70 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v52 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v69 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v54 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v71 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v67 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v23, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v20, v21 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v24 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v25, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v25, v26 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v27 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v66 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v20, 8, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v68 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v23 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v64 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v38 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v55 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v48 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v65 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v54 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v53 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v19 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v23, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v55 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v23 ; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v10, v10, v24 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v25, v26 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v28, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v28, v22 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v21 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v22 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v51 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v33 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v50 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v35 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v52 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v49 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v48 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v51 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v50 ; GFX11-FAKE16-NEXT: 
v_lshlrev_b16 v18, 8, v18 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v49 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v19 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v20, v21 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v22 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v23, v18 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v31 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v33 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v39 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v36 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v35 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v37 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v17 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v32 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v32 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v34 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v21 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v22 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v23, v17 @@ -86895,21 +86810,21 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: v_mul_f32_e32 v42, 1.0, v2 ; SI-NEXT: v_mul_f32_e32 v20, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v25, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v43, 1.0, v6 ; SI-NEXT: v_mul_f32_e32 v23, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v31, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v7 ; SI-NEXT: v_mul_f32_e32 v44, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v9 
; SI-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; SI-NEXT: v_mul_f32_e32 v35, 1.0, v11 ; SI-NEXT: v_mul_f32_e32 v56, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v16 ; SI-NEXT: v_mul_f32_e32 v39, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v17 ; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16 ; SI-NEXT: v_mul_f32_e64 v22, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 @@ -86923,25 +86838,19 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: v_mul_f32_e64 v7, 1.0, s26 ; SI-NEXT: v_mul_f32_e64 v10, 1.0, s29 ; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:176 ; 4-byte 
Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB109_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19 @@ -86997,213 +86906,219 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 -; SI-NEXT: v_alignbit_b32 v6, v1, v28, 16 +; SI-NEXT: v_alignbit_b32 v6, 
v1, v29, 16 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v31 -; SI-NEXT: v_alignbit_b32 v3, v1, v34, 16 +; SI-NEXT: v_alignbit_b32 v3, v1, v32, 16 +; SI-NEXT: v_mov_b32_e32 v48, v12 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v44 ; SI-NEXT: v_alignbit_b32 v2, v1, v35, 16 -; SI-NEXT: v_alignbit_b32 v8, v7, v33, 16 +; SI-NEXT: v_mov_b32_e32 v51, v14 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v14 +; SI-NEXT: v_alignbit_b32 v14, v12, v28, 16 +; SI-NEXT: v_alignbit_b32 v4, v14, v3, 24 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v56 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v4, v14, v3, 16 +; SI-NEXT: v_alignbit_b32 v8, v7, v38, 16 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v4, v8, v2, 24 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v36 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48 +; SI-NEXT: v_alignbit_b32 v4, v8, v2, 16 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v4, v8, v2, 8 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v33 ; SI-NEXT: v_alignbit_b32 v1, v1, v39, 16 ; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v43 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v44 -; SI-NEXT: v_alignbit_b32 v5, v4, v32, 16 -; SI-NEXT: v_mov_b32_e32 v31, v23 +; SI-NEXT: v_alignbit_b32 v5, v4, v45, 16 +; SI-NEXT: v_mov_b32_e32 v37, v32 +; SI-NEXT: v_mov_b32_e32 v32, v23 ; SI-NEXT: v_alignbit_b32 v20, v18, v23, 16 -; SI-NEXT: v_alignbit_b32 v14, v12, v29, 16 ; SI-NEXT: v_alignbit_b32 v23, v5, v1, 24 -; SI-NEXT: v_mov_b32_e32 v38, v36 -; 
SI-NEXT: v_alignbit_b32 v36, v20, v6, 24 -; SI-NEXT: v_alignbit_b32 v25, v14, v3, 24 -; SI-NEXT: v_alignbit_b32 v50, v8, v2, 16 -; SI-NEXT: v_mov_b32_e32 v53, v32 ; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v23, v5, v1, 16 -; SI-NEXT: v_alignbit_b32 v32, v5, v1, 8 +; SI-NEXT: v_mov_b32_e32 v34, v29 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v23, v5, v1, 8 +; SI-NEXT: v_mov_b32_e32 v29, v26 +; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v26 +; SI-NEXT: v_mov_b32_e32 v26, v42 +; SI-NEXT: v_lshrrev_b32_e32 v63, 24, v42 +; SI-NEXT: v_mov_b32_e32 v42, v33 +; SI-NEXT: v_mov_b32_e32 v36, v31 +; SI-NEXT: v_mov_b32_e32 v49, v35 +; SI-NEXT: v_mov_b32_e32 v52, v39 +; SI-NEXT: v_alignbit_b32 v53, v20, v6, 24 ; SI-NEXT: v_alignbit_b32 v55, v20, v6, 16 ; SI-NEXT: v_alignbit_b32 v40, v20, v6, 8 -; SI-NEXT: v_mov_b32_e32 v35, v29 -; SI-NEXT: v_alignbit_b32 v52, v14, v3, 16 +; SI-NEXT: v_mov_b32_e32 v35, v28 ; SI-NEXT: v_alignbit_b32 v54, v14, v3, 8 -; SI-NEXT: v_mov_b32_e32 v37, v33 -; SI-NEXT: v_alignbit_b32 v51, v8, v2, 8 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v39, v38 +; SI-NEXT: v_mov_b32_e32 v50, v45 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v25, v22 ; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v22 ; SI-NEXT: v_lshrrev_b32_e32 v62, 8, v30 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v23, v41 ; SI-NEXT: v_lshrrev_b32_e32 v41, 24, v41 ; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v19 -; SI-NEXT: v_mov_b32_e32 v28, v26 -; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v26 ; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v16 -; SI-NEXT: v_mov_b32_e32 v26, v42 -; SI-NEXT: v_lshrrev_b32_e32 v63, 24, v42 ; SI-NEXT: v_lshrrev_b32_e32 v58, 8, v11 -; SI-NEXT: v_mov_b32_e32 v29, v43 +; 
SI-NEXT: v_mov_b32_e32 v31, v43 ; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v43 ; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v20 -; SI-NEXT: v_mov_b32_e32 v34, v44 +; SI-NEXT: v_mov_b32_e32 v28, v44 ; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v44 ; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v14 -; SI-NEXT: v_mov_b32_e32 v33, v56 +; SI-NEXT: v_mov_b32_e32 v38, v56 ; SI-NEXT: v_lshrrev_b32_e32 v43, 24, v56 ; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v8 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_lshrrev_b32_e32 v42, 24, v48 -; SI-NEXT: v_mov_b32_e32 v48, v32 -; SI-NEXT: v_mov_b32_e32 v32, v50 -; SI-NEXT: v_mov_b32_e32 v50, v25 -; SI-NEXT: v_mov_b32_e32 v25, v36 -; SI-NEXT: v_mov_b32_e32 v36, v38 +; SI-NEXT: v_lshrrev_b32_e32 v42, 24, v42 ; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v5 ; SI-NEXT: s_cbranch_execnz .LBB109_3 ; SI-NEXT: .LBB109_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v36 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v39 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v44, 0x40c00000, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v51 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v49 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v33 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v42, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v42 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v48 ; SI-NEXT: v_alignbit_b32 v5, v4, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 
4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v33 -; SI-NEXT: v_add_f32_e32 v43, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v43 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v34 -; SI-NEXT: v_add_f32_e32 v44, 0x40c00000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v44 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v45, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v45 -; SI-NEXT: v_alignbit_b32 v48, v5, v1, 8 -; SI-NEXT: v_lshrrev_b32_e32 v43, 24, v43 -; SI-NEXT: v_lshrrev_b32_e32 v42, 24, v42 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v49 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38 ; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v37 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v43, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v43 +; SI-NEXT: v_and_b32_e32 v6, 
0xffff0000, v36 ; SI-NEXT: v_alignbit_b32 v8, v7, v3, 16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v32, v8, v2, 16 -; SI-NEXT: v_alignbit_b32 v51, v8, v2, 8 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_alignbit_b32 v3, v6, v3, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v44 +; SI-NEXT: v_add_f32_e32 v45, 0x40c00000, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v14, v12, v6, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v45 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v54, v14, v3, 8 +; SI-NEXT: v_lshrrev_b32_e32 v43, 24, v43 +; SI-NEXT: v_lshrrev_b32_e32 v42, 24, v42 +; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v14 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_alignbit_b32 v6, v9, v6, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_alignbit_b32 v20, v18, v9, 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v53, v20, v6, 24 +; SI-NEXT: v_alignbit_b32 v55, 
v20, v6, 16 +; SI-NEXT: v_alignbit_b32 v40, v20, v6, 8 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_alignbit_b32 v15, v15, v13, 16 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_alignbit_b32 v21, v19, v17, 16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_alignbit_b32 v15, v15, v13, 16 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_alignbit_b32 v3, v6, v3, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v35 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_alignbit_b32 v14, v12, v6, 16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v50, v14, v3, 24 -; SI-NEXT: v_alignbit_b32 v52, v14, v3, 16 -; SI-NEXT: 
v_alignbit_b32 v54, v14, v3, 8 -; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v14 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_alignbit_b32 v10, v10, v9, 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v13 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v28 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 ; SI-NEXT: v_add_f32_e32 v41, 0x40c00000, v17 -; SI-NEXT: v_add_f32_e32 v56, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v41 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v56 ; SI-NEXT: v_lshrrev_b32_e32 v41, 24, v41 -; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v56 -; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v8 ; SI-NEXT: v_alignbit_b32 v19, v17, v19, 16 -; SI-NEXT: v_alignbit_b32 v16, v13, v16, 16 ; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v19 -; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_alignbit_b32 v6, v9, v6, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_alignbit_b32 v20, v18, v9, 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v25, v20, v6, 24 -; SI-NEXT: v_alignbit_b32 v55, v20, v6, 16 -; SI-NEXT: v_alignbit_b32 v40, v20, v6, 8 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v11, 
0x40c00000, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v47, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v47 +; SI-NEXT: v_alignbit_b32 v11, v9, v11, 16 +; SI-NEXT: v_lshrrev_b32_e32 v58, 8, v11 +; SI-NEXT: v_lshrrev_b32_e32 v63, 24, v47 +; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v20 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_alignbit_b32 v27, v23, v22, 16 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_alignbit_b32 v10, v10, v9, 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v25 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; SI-NEXT: v_add_f32_e32 v59, 0x40c00000, v23 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v29 ; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v59 +; SI-NEXT: v_add_f32_e32 v56, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v56 +; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v8 +; SI-NEXT: v_alignbit_b32 v16, v13, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 ; SI-NEXT: v_alignbit_b32 v30, v24, v22, 16 ; SI-NEXT: v_alignbit_b32 v22, v30, v27, 24 ; 
SI-NEXT: v_lshrrev_b32_e32 v62, 8, v30 @@ -87232,18 +87147,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v22, v16, v15, 8 ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v26 -; SI-NEXT: v_add_f32_e32 v47, 0x40c00000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v47 -; SI-NEXT: v_alignbit_b32 v11, v9, v11, 16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v22, v11, v10, 24 -; SI-NEXT: v_lshrrev_b32_e32 v58, 8, v11 -; SI-NEXT: v_lshrrev_b32_e32 v63, 24, v47 -; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v20 ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v22, v11, v10, 16 @@ -87252,15 +87157,30 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: v_alignbit_b32 v22, v11, v10, 8 ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v14, v3, 24 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v14, v3, 16 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v22, v8, v2, 24 ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v8, v2, 16 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v8, v2, 8 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill 
+; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v22, v5, v1, 24 ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v22, v5, v1, 16 ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v5, v1, 8 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v59 ; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v45 ; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v44 @@ -87393,7 +87313,7 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: v_or_b32_e32 v6, v6, v9 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v55 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v53 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_or_b32_e32 v6, v6, v9 @@ -87414,10 +87334,14 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v54 ; SI-NEXT: v_or_b32_e32 v3, v3, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v52 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v50 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; SI-NEXT: v_or_b32_e32 v6, v9, v6 ; SI-NEXT: v_or_b32_e32 v3, v3, v6 ; SI-NEXT: v_add_i32_e32 v6, vcc, 40, v0 @@ -87434,15 +87358,19 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; 
SI-NEXT: v_or_b32_e32 v3, v3, v6 ; SI-NEXT: v_add_i32_e32 v6, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v51 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v32 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v6, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 @@ -87460,16 +87388,18 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v48 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: 
v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 @@ -87508,39 +87438,56 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: v_mov_b32_e32 v53, v32 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: v_mov_b32_e32 v52, v39 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: v_mov_b32_e32 v51, v14 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: v_mov_b32_e32 v49, v48 +; SI-NEXT: v_mov_b32_e32 v50, v45 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: v_mov_b32_e32 v37, v33 +; SI-NEXT: v_mov_b32_e32 v49, v35 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: v_mov_b32_e32 v33, v56 +; SI-NEXT: v_mov_b32_e32 v48, v12 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v35, v29 +; SI-NEXT: v_mov_b32_e32 v39, v38 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: v_mov_b32_e32 v34, v44 +; SI-NEXT: v_mov_b32_e32 v38, v56 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: v_mov_b32_e32 v31, v23 +; SI-NEXT: v_mov_b32_e32 v37, v32 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: v_mov_b32_e32 v29, v43 +; SI-NEXT: v_mov_b32_e32 v36, v31 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: v_mov_b32_e32 v28, v26 +; SI-NEXT: v_mov_b32_e32 v35, v28 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: v_mov_b32_e32 v26, v42 +; SI-NEXT: v_mov_b32_e32 v28, v44 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; 
SI-NEXT: v_mov_b32_e32 v23, v41 +; SI-NEXT: v_mov_b32_e32 v34, v29 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: v_mov_b32_e32 v32, v23 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: v_mov_b32_e32 v31, v43 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: v_mov_b32_e32 v29, v26 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_mov_b32_e32 v26, v42 +; SI-NEXT: v_mov_b32_e32 v23, v41 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v25, v22 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr4 @@ -87568,33 +87515,28 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: s_branch .LBB109_2 ; @@ -88370,27 +88312,27 @@ define inreg <64 x i8> 
@bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; GFX9-NEXT: s_lshr_b32 s60, s22, 8 ; GFX9-NEXT: s_lshr_b32 s48, s21, 24 ; GFX9-NEXT: s_lshr_b32 s78, s21, 16 -; GFX9-NEXT: s_lshr_b32 s59, s21, 8 +; GFX9-NEXT: s_lshr_b32 s45, s21, 8 ; GFX9-NEXT: s_lshr_b32 s49, s20, 16 -; GFX9-NEXT: s_lshr_b32 s58, s20, 8 +; GFX9-NEXT: s_lshr_b32 s44, s20, 8 ; GFX9-NEXT: s_lshr_b32 s50, s19, 24 ; GFX9-NEXT: s_lshr_b32 s77, s19, 16 -; GFX9-NEXT: s_lshr_b32 s57, s19, 8 +; GFX9-NEXT: s_lshr_b32 s13, s19, 8 ; GFX9-NEXT: s_lshr_b32 s51, s18, 16 -; GFX9-NEXT: s_lshr_b32 s56, s18, 8 +; GFX9-NEXT: s_lshr_b32 s12, s18, 8 ; GFX9-NEXT: s_lshr_b32 s52, s17, 24 ; GFX9-NEXT: s_lshr_b32 s76, s17, 16 ; GFX9-NEXT: s_lshr_b32 s53, s17, 8 ; GFX9-NEXT: s_lshr_b32 s54, s16, 16 ; GFX9-NEXT: s_lshr_b32 s55, s16, 8 -; GFX9-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 -; GFX9-NEXT: s_lshr_b64 s[8:9], s[28:29], 24 -; GFX9-NEXT: s_lshr_b64 s[10:11], s[26:27], 24 -; GFX9-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 -; GFX9-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 -; GFX9-NEXT: s_lshr_b64 s[40:41], s[20:21], 24 -; GFX9-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 -; GFX9-NEXT: s_lshr_b64 s[44:45], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[4:5], 24 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[28:29], 24 +; GFX9-NEXT: s_lshr_b64 s[14:15], s[26:27], 24 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[42:43], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[46:47], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[56:57], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[58:59], s[16:17], 24 ; GFX9-NEXT: s_cbranch_execnz .LBB109_3 ; GFX9-NEXT: .LBB109_2: ; %cmp.true ; GFX9-NEXT: s_and_b32 s6, s17, 0xffff0000 @@ -88419,357 +88361,357 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; GFX9-NEXT: s_and_b32 s6, s16, 0xffff0000 ; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 ; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 
s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_bfe_u32 s8, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s8, s8, s6 +; GFX9-NEXT: s_add_i32 s10, s8, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s6, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s8, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s16, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s6, s10 +; GFX9-NEXT: s_lshl_b32 s8, s16, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s8, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010 +; GFX9-NEXT: s_add_i32 s9, s9, s8 +; GFX9-NEXT: s_lshr_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s10, s9, 0x7fff +; GFX9-NEXT: s_or_b32 s11, s8, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s10, s9 -; GFX9-NEXT: s_lshr_b32 s16, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s19, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s46, s16, s8 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec +; GFX9-NEXT: s_cselect_b32 s8, s11, s10 +; GFX9-NEXT: s_lshr_b32 s16, s8, 16 +; GFX9-NEXT: s_and_b32 s8, s19, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s8, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010 +; GFX9-NEXT: s_add_i32 s9, s9, s8 +; GFX9-NEXT: s_add_i32 s10, s9, 0x7fff +; GFX9-NEXT: s_or_b32 s11, s8, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; 
GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s77, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s19, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec +; GFX9-NEXT: s_cselect_b32 s8, s11, s10 +; GFX9-NEXT: s_lshr_b32 s77, s8, 16 +; GFX9-NEXT: s_lshl_b32 s8, s19, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s8, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010 +; GFX9-NEXT: s_add_i32 s9, s9, s8 +; GFX9-NEXT: s_add_i32 s10, s9, 0x7fff +; GFX9-NEXT: s_or_b32 s11, s8, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s19, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s18, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec +; GFX9-NEXT: s_cselect_b32 s8, s11, s10 +; GFX9-NEXT: s_lshr_b32 s19, s8, 16 +; GFX9-NEXT: s_and_b32 s8, s18, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s8, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010 +; GFX9-NEXT: s_add_i32 s9, s9, s8 +; GFX9-NEXT: s_add_i32 s10, s9, 0x7fff +; GFX9-NEXT: s_or_b32 s11, s8, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s8, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s18, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s10, s6, 
0x400000 +; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec +; GFX9-NEXT: s_cselect_b32 s8, s11, s10 +; GFX9-NEXT: s_lshr_b32 s10, s8, 16 +; GFX9-NEXT: s_lshl_b32 s8, s18, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s8, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010 +; GFX9-NEXT: s_add_i32 s9, s9, s8 +; GFX9-NEXT: s_add_i32 s11, s9, 0x7fff +; GFX9-NEXT: s_or_b32 s12, s8, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s10, s9 -; GFX9-NEXT: s_lshr_b32 s18, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s21, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s56, s18, s8 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec +; GFX9-NEXT: s_cselect_b32 s8, s12, s11 +; GFX9-NEXT: s_lshr_b32 s18, s8, 16 +; GFX9-NEXT: s_and_b32 s8, s21, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s8, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010 +; GFX9-NEXT: s_add_i32 s9, s9, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s18, s10 +; GFX9-NEXT: s_add_i32 s10, s9, 0x7fff +; GFX9-NEXT: s_or_b32 s11, s8, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s78, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s21, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec +; GFX9-NEXT: s_cselect_b32 s8, s11, s10 +; GFX9-NEXT: s_lshr_b32 s78, s8, 16 +; GFX9-NEXT: s_lshl_b32 s8, s21, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s8, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 
+; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010 +; GFX9-NEXT: s_add_i32 s9, s9, s8 +; GFX9-NEXT: s_add_i32 s10, s9, 0x7fff +; GFX9-NEXT: s_or_b32 s11, s8, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s21, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s20, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec +; GFX9-NEXT: s_cselect_b32 s8, s11, s10 +; GFX9-NEXT: s_lshr_b32 s21, s8, 16 +; GFX9-NEXT: s_and_b32 s8, s20, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s8, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010 +; GFX9-NEXT: s_add_i32 s9, s9, s8 +; GFX9-NEXT: s_add_i32 s10, s9, 0x7fff +; GFX9-NEXT: s_or_b32 s11, s8, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s8, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s20, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec +; GFX9-NEXT: s_cselect_b32 s8, s11, s10 +; GFX9-NEXT: s_lshr_b32 s10, s8, 16 +; GFX9-NEXT: s_lshl_b32 s8, s20, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s8, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010 +; GFX9-NEXT: s_add_i32 s9, s9, s8 +; GFX9-NEXT: s_add_i32 s11, s9, 0x7fff +; GFX9-NEXT: s_or_b32 s14, s8, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s10, s9 -; GFX9-NEXT: s_lshr_b32 s20, s6, 16 -; GFX9-NEXT: s_and_b32 
s6, s23, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s58, s20, s8 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec +; GFX9-NEXT: s_cselect_b32 s8, s14, s11 +; GFX9-NEXT: s_lshr_b32 s20, s8, 16 +; GFX9-NEXT: s_and_b32 s8, s23, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s8, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010 +; GFX9-NEXT: s_add_i32 s9, s9, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s44, s20, s10 +; GFX9-NEXT: s_add_i32 s10, s9, 0x7fff +; GFX9-NEXT: s_or_b32 s11, s8, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s79, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s23, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec +; GFX9-NEXT: s_cselect_b32 s8, s11, s10 +; GFX9-NEXT: s_lshr_b32 s79, s8, 16 +; GFX9-NEXT: s_lshl_b32 s8, s23, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s8, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010 +; GFX9-NEXT: s_add_i32 s9, s9, s8 +; GFX9-NEXT: s_add_i32 s10, s9, 0x7fff +; GFX9-NEXT: s_or_b32 s11, s8, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s23, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s22, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 
0x400000 +; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec +; GFX9-NEXT: s_cselect_b32 s8, s11, s10 +; GFX9-NEXT: s_lshr_b32 s23, s8, 16 +; GFX9-NEXT: s_and_b32 s8, s22, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s8, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010 +; GFX9-NEXT: s_add_i32 s9, s9, s8 +; GFX9-NEXT: s_add_i32 s10, s9, 0x7fff +; GFX9-NEXT: s_or_b32 s11, s8, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s8, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s22, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec +; GFX9-NEXT: s_cselect_b32 s8, s11, s10 +; GFX9-NEXT: s_lshr_b32 s10, s8, 16 +; GFX9-NEXT: s_lshl_b32 s8, s22, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s8, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010 +; GFX9-NEXT: s_add_i32 s9, s9, s8 +; GFX9-NEXT: s_add_i32 s11, s9, 0x7fff +; GFX9-NEXT: s_or_b32 s14, s8, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s10, s9 -; GFX9-NEXT: s_lshr_b32 s22, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s25, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s60, s22, s8 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec +; GFX9-NEXT: s_cselect_b32 s8, s14, s11 +; GFX9-NEXT: s_lshr_b32 s22, s8, 16 +; GFX9-NEXT: s_and_b32 s8, s25, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s8, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010 
+; GFX9-NEXT: s_add_i32 s9, s9, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s60, s22, s10 +; GFX9-NEXT: s_add_i32 s10, s9, 0x7fff +; GFX9-NEXT: s_or_b32 s11, s8, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s88, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s25, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec +; GFX9-NEXT: s_cselect_b32 s8, s11, s10 +; GFX9-NEXT: s_lshr_b32 s88, s8, 16 +; GFX9-NEXT: s_lshl_b32 s8, s25, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s8, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010 +; GFX9-NEXT: s_add_i32 s9, s9, s8 +; GFX9-NEXT: s_add_i32 s10, s9, 0x7fff +; GFX9-NEXT: s_or_b32 s11, s8, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s25, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s24, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec +; GFX9-NEXT: s_cselect_b32 s8, s11, s10 +; GFX9-NEXT: s_lshr_b32 s25, s8, 16 +; GFX9-NEXT: s_and_b32 s8, s24, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s8, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010 +; GFX9-NEXT: s_add_i32 s9, s9, s8 +; GFX9-NEXT: s_add_i32 s10, s9, 0x7fff +; GFX9-NEXT: s_or_b32 s11, s8, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s8, s6, 16 -; GFX9-NEXT: 
s_lshl_b32 s6, s24, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec +; GFX9-NEXT: s_cselect_b32 s8, s11, s10 +; GFX9-NEXT: s_lshr_b32 s10, s8, 16 +; GFX9-NEXT: s_lshl_b32 s8, s24, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s8, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010 +; GFX9-NEXT: s_add_i32 s9, s9, s8 +; GFX9-NEXT: s_add_i32 s11, s9, 0x7fff +; GFX9-NEXT: s_or_b32 s14, s8, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s10, s9 -; GFX9-NEXT: s_lshr_b32 s24, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s27, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s62, s24, s8 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec +; GFX9-NEXT: s_cselect_b32 s8, s14, s11 +; GFX9-NEXT: s_lshr_b32 s24, s8, 16 +; GFX9-NEXT: s_and_b32 s8, s27, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s8, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010 +; GFX9-NEXT: s_add_i32 s9, s9, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s62, s24, s10 +; GFX9-NEXT: s_add_i32 s10, s9, 0x7fff +; GFX9-NEXT: s_or_b32 s11, s8, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s89, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s27, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, 
s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec +; GFX9-NEXT: s_cselect_b32 s8, s11, s10 +; GFX9-NEXT: s_lshr_b32 s89, s8, 16 +; GFX9-NEXT: s_lshl_b32 s8, s27, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s8, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010 +; GFX9-NEXT: s_add_i32 s9, s9, s8 +; GFX9-NEXT: s_add_i32 s10, s9, 0x7fff +; GFX9-NEXT: s_or_b32 s11, s8, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s27, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s26, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec +; GFX9-NEXT: s_cselect_b32 s8, s11, s10 +; GFX9-NEXT: s_lshr_b32 s27, s8, 16 +; GFX9-NEXT: s_and_b32 s8, s26, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s8, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010 +; GFX9-NEXT: s_add_i32 s9, s9, s8 +; GFX9-NEXT: s_add_i32 s10, s9, 0x7fff +; GFX9-NEXT: s_or_b32 s11, s8, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s8, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s26, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec +; GFX9-NEXT: s_cselect_b32 s8, s11, s10 +; GFX9-NEXT: s_lshr_b32 s10, s8, 16 +; GFX9-NEXT: s_lshl_b32 s8, s26, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s8, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010 +; GFX9-NEXT: s_add_i32 s9, s9, s8 +; GFX9-NEXT: 
s_add_i32 s11, s9, 0x7fff +; GFX9-NEXT: s_or_b32 s14, s8, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s10, s9 -; GFX9-NEXT: s_lshr_b32 s26, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s29, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s72, s26, s8 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec +; GFX9-NEXT: s_cselect_b32 s8, s14, s11 +; GFX9-NEXT: s_lshr_b32 s26, s8, 16 +; GFX9-NEXT: s_and_b32 s8, s29, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s8, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010 +; GFX9-NEXT: s_add_i32 s9, s9, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s72, s26, s10 +; GFX9-NEXT: s_add_i32 s10, s9, 0x7fff +; GFX9-NEXT: s_or_b32 s11, s8, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s90, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s29, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec +; GFX9-NEXT: s_cselect_b32 s8, s11, s10 +; GFX9-NEXT: s_lshr_b32 s90, s8, 16 +; GFX9-NEXT: s_lshl_b32 s8, s29, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s8, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010 +; GFX9-NEXT: s_add_i32 s9, s9, s8 +; GFX9-NEXT: s_add_i32 s10, s9, 0x7fff +; GFX9-NEXT: s_or_b32 s11, s8, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s29, s6, 16 -; GFX9-NEXT: 
s_and_b32 s6, s28, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec +; GFX9-NEXT: s_cselect_b32 s8, s11, s10 +; GFX9-NEXT: s_lshr_b32 s29, s8, 16 +; GFX9-NEXT: s_and_b32 s8, s28, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s8, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010 +; GFX9-NEXT: s_add_i32 s9, s9, s8 +; GFX9-NEXT: s_add_i32 s10, s9, 0x7fff +; GFX9-NEXT: s_or_b32 s11, s8, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 -; GFX9-NEXT: s_lshr_b32 s8, s6, 16 -; GFX9-NEXT: s_lshl_b32 s6, s28, 16 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec +; GFX9-NEXT: s_cselect_b32 s8, s11, s10 +; GFX9-NEXT: s_lshr_b32 s10, s8, 16 +; GFX9-NEXT: s_lshl_b32 s8, s28, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s8, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010 +; GFX9-NEXT: s_add_i32 s9, s9, s8 +; GFX9-NEXT: s_add_i32 s11, s9, 0x7fff +; GFX9-NEXT: s_or_b32 s14, s8, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s10, s9 -; GFX9-NEXT: s_lshr_b32 s28, s6, 16 -; GFX9-NEXT: s_and_b32 s6, s5, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s74, s28, s8 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_and_b64 
s[8:9], vcc, exec +; GFX9-NEXT: s_cselect_b32 s8, s14, s11 +; GFX9-NEXT: s_lshr_b32 s28, s8, 16 +; GFX9-NEXT: s_and_b32 s8, s5, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s8, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010 +; GFX9-NEXT: s_add_i32 s9, s9, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s74, s28, s10 +; GFX9-NEXT: s_add_i32 s10, s9, 0x7fff +; GFX9-NEXT: s_or_b32 s11, s8, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec +; GFX9-NEXT: s_cselect_b32 s8, s11, s10 ; GFX9-NEXT: s_lshl_b32 s5, s5, 16 ; GFX9-NEXT: v_add_f32_e32 v2, s5, v1 ; GFX9-NEXT: v_readfirstlane_b32 s5, v2 -; GFX9-NEXT: s_lshr_b32 s91, s6, 16 -; GFX9-NEXT: s_bfe_u32 s6, s5, 0x10010 -; GFX9-NEXT: s_add_i32 s6, s6, s5 -; GFX9-NEXT: s_add_i32 s8, s6, 0x7fff +; GFX9-NEXT: s_lshr_b32 s91, s8, 16 +; GFX9-NEXT: s_bfe_u32 s8, s5, 0x10010 +; GFX9-NEXT: s_add_i32 s8, s8, s5 +; GFX9-NEXT: s_add_i32 s10, s8, 0x7fff ; GFX9-NEXT: s_bitset1_b32 s5, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s5, s5, s8 -; GFX9-NEXT: s_and_b32 s6, s4, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 -; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec +; GFX9-NEXT: s_cselect_b32 s5, s5, s10 +; GFX9-NEXT: s_and_b32 s8, s4, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s8, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010 +; GFX9-NEXT: s_add_i32 s9, s9, s8 ; GFX9-NEXT: s_lshr_b32 s5, s5, 16 -; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff -; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: s_add_i32 s10, s9, 0x7fff +; GFX9-NEXT: s_or_b32 s11, s8, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; 
GFX9-NEXT: s_and_b64 s[8:9], vcc, exec +; GFX9-NEXT: s_cselect_b32 s8, s11, s10 ; GFX9-NEXT: s_lshl_b32 s4, s4, 16 ; GFX9-NEXT: v_add_f32_e32 v1, s4, v1 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 -; GFX9-NEXT: s_lshr_b32 s8, s6, 16 -; GFX9-NEXT: s_bfe_u32 s6, s4, 0x10010 -; GFX9-NEXT: s_add_i32 s6, s6, s4 -; GFX9-NEXT: s_add_i32 s9, s6, 0x7fff +; GFX9-NEXT: s_lshr_b32 s10, s8, 16 +; GFX9-NEXT: s_bfe_u32 s8, s4, 0x10010 +; GFX9-NEXT: s_add_i32 s8, s8, s4 +; GFX9-NEXT: s_add_i32 s11, s8, 0x7fff ; GFX9-NEXT: s_bitset1_b32 s4, 22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s4, s4, s9 +; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec +; GFX9-NEXT: s_cselect_b32 s4, s4, s11 ; GFX9-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s47, s17, s76 -; GFX9-NEXT: s_pack_ll_b32_b16 s57, s19, s77 -; GFX9-NEXT: s_pack_ll_b32_b16 s59, s21, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s19, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s45, s21, s78 ; GFX9-NEXT: s_pack_ll_b32_b16 s61, s23, s79 ; GFX9-NEXT: s_pack_ll_b32_b16 s63, s25, s88 ; GFX9-NEXT: s_pack_ll_b32_b16 s73, s27, s89 ; GFX9-NEXT: s_pack_ll_b32_b16 s75, s29, s90 ; GFX9-NEXT: s_pack_ll_b32_b16 s31, s5, s91 -; GFX9-NEXT: s_pack_ll_b32_b16 s30, s4, s8 -; GFX9-NEXT: s_lshr_b64 s[6:7], s[30:31], 24 -; GFX9-NEXT: s_lshr_b64 s[8:9], s[74:75], 24 -; GFX9-NEXT: s_lshr_b64 s[10:11], s[72:73], 24 -; GFX9-NEXT: s_lshr_b64 s[12:13], s[62:63], 24 -; GFX9-NEXT: s_lshr_b64 s[14:15], s[60:61], 24 -; GFX9-NEXT: s_lshr_b64 s[40:41], s[58:59], 24 -; GFX9-NEXT: s_lshr_b64 s[42:43], s[56:57], 24 -; GFX9-NEXT: s_lshr_b64 s[44:45], s[46:47], 24 +; GFX9-NEXT: s_pack_ll_b32_b16 s30, s4, s10 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[30:31], 24 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[74:75], 24 +; GFX9-NEXT: s_lshr_b64 s[14:15], s[72:73], 24 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[62:63], 24 +; GFX9-NEXT: s_lshr_b64 
s[42:43], s[60:61], 24 +; GFX9-NEXT: s_lshr_b64 s[46:47], s[44:45], 24 +; GFX9-NEXT: s_lshr_b64 s[56:57], s[12:13], 24 +; GFX9-NEXT: s_lshr_b64 s[58:59], s[6:7], 24 ; GFX9-NEXT: s_lshr_b32 s92, s31, 24 ; GFX9-NEXT: s_lshr_b32 s93, s31, 8 ; GFX9-NEXT: s_lshr_b32 s94, s30, 16 @@ -88789,179 +88731,179 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; GFX9-NEXT: s_lshr_b32 s38, s61, 24 ; GFX9-NEXT: s_lshr_b32 s61, s61, 8 ; GFX9-NEXT: s_lshr_b32 s39, s60, 16 -; GFX9-NEXT: s_lshr_b32 s60, s60, 8 -; GFX9-NEXT: s_lshr_b32 s48, s59, 24 -; GFX9-NEXT: s_lshr_b32 s59, s59, 8 -; GFX9-NEXT: s_lshr_b32 s49, s58, 16 -; GFX9-NEXT: s_lshr_b32 s58, s58, 8 -; GFX9-NEXT: s_lshr_b32 s50, s57, 24 -; GFX9-NEXT: s_lshr_b32 s57, s57, 8 -; GFX9-NEXT: s_lshr_b32 s51, s56, 16 -; GFX9-NEXT: s_lshr_b32 s56, s56, 8 -; GFX9-NEXT: s_lshr_b32 s52, s47, 24 -; GFX9-NEXT: s_lshr_b32 s53, s47, 8 -; GFX9-NEXT: s_lshr_b32 s54, s46, 16 -; GFX9-NEXT: s_lshr_b32 s55, s46, 8 -; GFX9-NEXT: .LBB109_3: ; %end -; GFX9-NEXT: s_and_b32 s7, s16, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s55, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s54, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s44, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s17, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s53, 8 +; GFX9-NEXT: s_lshr_b32 s60, s60, 8 +; GFX9-NEXT: s_lshr_b32 s48, s45, 24 +; GFX9-NEXT: s_lshr_b32 s45, s45, 8 +; GFX9-NEXT: s_lshr_b32 s49, s44, 16 +; GFX9-NEXT: s_lshr_b32 s44, s44, 8 +; GFX9-NEXT: s_lshr_b32 s50, s13, 24 +; GFX9-NEXT: s_lshr_b32 s13, s13, 8 +; GFX9-NEXT: s_lshr_b32 s51, s12, 16 +; GFX9-NEXT: s_lshr_b32 s12, s12, 8 +; GFX9-NEXT: s_lshr_b32 s52, s7, 24 +; GFX9-NEXT: s_lshr_b32 s53, s7, 8 +; GFX9-NEXT: s_lshr_b32 s54, s6, 16 +; GFX9-NEXT: s_lshr_b32 s55, s6, 8 +; GFX9-NEXT: .LBB109_3: ; %end +; GFX9-NEXT: s_and_b32 s6, s16, 
0xff +; GFX9-NEXT: s_lshl_b32 s7, s55, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s54, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s58, 8 ; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s76, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s52, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: s_and_b32 s6, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s53, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s76, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s52, 8 ; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s18, 0xff +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: s_and_b32 s6, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s12, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s51, 0xff ; GFX9-NEXT: s_lshl_b32 s9, s56, 8 ; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s51, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s42, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s19, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s57, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s77, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s50, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: s_and_b32 s6, 
s19, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s13, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s77, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s50, 8 ; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s20, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s58, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s49, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s40, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: s_and_b32 s6, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s44, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s49, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s46, 8 ; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s21, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s59, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s78, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s48, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: s_and_b32 s6, s21, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s45, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s78, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s48, 8 ; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s22, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s60, 8 -; GFX9-NEXT: s_or_b32 
s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s39, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s14, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s60, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s39, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s42, 8 ; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s23, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s61, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s79, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s38, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: s_and_b32 s6, s23, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s61, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s79, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s38, 8 ; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s24, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s62, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s37, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s12, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s62, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s37, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s40, 8 ; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; 
GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s25, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s63, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s88, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s36, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: s_and_b32 s6, s25, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s63, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s88, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s36, 8 ; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s26, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s72, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s35, 0xff -; GFX9-NEXT: s_lshl_b32 s10, s10, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s10 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: s_and_b32 s6, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s72, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s35, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s14, 8 ; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s27, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s73, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s89, 0xff -; GFX9-NEXT: s_lshl_b32 s10, s34, 8 -; GFX9-NEXT: s_or_b32 s9, s9, s10 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: v_mov_b32_e32 v1, 
s6 +; GFX9-NEXT: s_and_b32 s6, s27, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s73, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s89, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s34, 8 ; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s28, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s74, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: s_and_b32 s6, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s74, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s31, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s10, 8 ; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s9, s31, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s8, 8 -; GFX9-NEXT: s_or_b32 s8, s9, s8 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s8, s8, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s7, s29, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s75, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: s_and_b32 s8, s90, 0xff +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: s_and_b32 s6, s29, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s75, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s90, 0xff ; GFX9-NEXT: s_lshl_b32 s9, s30, 8 -; GFX9-NEXT: s_or_b32 s8, s8, s9 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s8, s8, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: s_and_b32 s4, s4, 0xff 
-; GFX9-NEXT: s_lshl_b32 s7, s95, 8 -; GFX9-NEXT: s_or_b32 s4, s4, s7 -; GFX9-NEXT: s_and_b32 s7, s94, 0xff -; GFX9-NEXT: s_lshl_b32 s6, s6, 8 -; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_lshl_b32 s6, s95, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s94, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s8, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 ; GFX9-NEXT: s_or_b32 s4, s4, s6 @@ -89003,49 +88945,49 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; GFX9-NEXT: .LBB109_4: ; GFX9-NEXT: ; implicit-def: $sgpr55 ; GFX9-NEXT: ; implicit-def: $sgpr54 -; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr58 ; GFX9-NEXT: ; implicit-def: $sgpr53 ; GFX9-NEXT: ; implicit-def: $sgpr76 ; GFX9-NEXT: ; implicit-def: $sgpr52 -; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr12 ; GFX9-NEXT: ; implicit-def: $sgpr51 -; GFX9-NEXT: ; implicit-def: $sgpr42 -; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr13 ; GFX9-NEXT: ; implicit-def: $sgpr77 ; GFX9-NEXT: ; implicit-def: $sgpr50 -; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr44 ; GFX9-NEXT: ; implicit-def: $sgpr49 -; GFX9-NEXT: ; implicit-def: $sgpr40 -; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 ; GFX9-NEXT: ; implicit-def: $sgpr78 ; GFX9-NEXT: ; implicit-def: $sgpr48 ; GFX9-NEXT: ; implicit-def: $sgpr60 ; GFX9-NEXT: ; implicit-def: $sgpr39 -; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr42 ; GFX9-NEXT: ; implicit-def: $sgpr61 ; GFX9-NEXT: ; implicit-def: $sgpr79 ; GFX9-NEXT: ; implicit-def: $sgpr38 ; GFX9-NEXT: ; implicit-def: $sgpr62 ; GFX9-NEXT: ; implicit-def: $sgpr37 -; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr40 ; GFX9-NEXT: ; implicit-def: $sgpr63 ; GFX9-NEXT: ; 
implicit-def: $sgpr88 ; GFX9-NEXT: ; implicit-def: $sgpr36 ; GFX9-NEXT: ; implicit-def: $sgpr72 ; GFX9-NEXT: ; implicit-def: $sgpr35 -; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr14 ; GFX9-NEXT: ; implicit-def: $sgpr73 ; GFX9-NEXT: ; implicit-def: $sgpr89 ; GFX9-NEXT: ; implicit-def: $sgpr34 ; GFX9-NEXT: ; implicit-def: $sgpr74 ; GFX9-NEXT: ; implicit-def: $sgpr31 -; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr10 ; GFX9-NEXT: ; implicit-def: $sgpr75 ; GFX9-NEXT: ; implicit-def: $sgpr90 ; GFX9-NEXT: ; implicit-def: $sgpr30 ; GFX9-NEXT: ; implicit-def: $sgpr95 ; GFX9-NEXT: ; implicit-def: $sgpr94 -; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr8 ; GFX9-NEXT: ; implicit-def: $sgpr93 ; GFX9-NEXT: ; implicit-def: $sgpr91 ; GFX9-NEXT: ; implicit-def: $sgpr92 @@ -89059,7 +89001,7 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; GFX11-NEXT: s_mov_b32 exec_lo, s4 ; GFX11-NEXT: v_writelane_b32 v17, s30, 0 ; GFX11-NEXT: s_cmp_lg_u32 s28, 0 -; GFX11-NEXT: s_mov_b32 vcc_lo, 0 +; GFX11-NEXT: s_mov_b32 s42, 0 ; GFX11-NEXT: v_writelane_b32 v17, s31, 1 ; GFX11-NEXT: v_writelane_b32 v17, s34, 2 ; GFX11-NEXT: v_writelane_b32 v17, s35, 3 @@ -89068,8 +89010,9 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; GFX11-NEXT: v_writelane_b32 v17, s38, 6 ; GFX11-NEXT: v_writelane_b32 v17, s39, 7 ; GFX11-NEXT: v_writelane_b32 v17, s48, 8 -; GFX11-NEXT: v_writelane_b32 v17, s50, 9 -; GFX11-NEXT: v_writelane_b32 v17, s51, 10 +; GFX11-NEXT: v_writelane_b32 v17, s49, 9 +; GFX11-NEXT: v_writelane_b32 v17, s50, 10 +; GFX11-NEXT: v_writelane_b32 v17, s51, 11 ; GFX11-NEXT: s_cbranch_scc0 .LBB109_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_lshr_b32 s62, s27, 24 @@ -89111,7 +89054,7 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; GFX11-NEXT: s_lshr_b32 s46, s1, 16 ; GFX11-NEXT: s_lshr_b32 s39, s1, 8 ; GFX11-NEXT: 
s_lshr_b32 s48, s0, 16 -; GFX11-NEXT: s_lshr_b32 s42, s0, 8 +; GFX11-NEXT: s_lshr_b32 s49, s0, 8 ; GFX11-NEXT: s_lshr_b64 s[10:11], s[26:27], 24 ; GFX11-NEXT: s_lshr_b64 s[4:5], s[24:25], 24 ; GFX11-NEXT: s_lshr_b64 s[6:7], s[22:23], 24 @@ -89120,7 +89063,7 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; GFX11-NEXT: s_lshr_b64 s[14:15], s[16:17], 24 ; GFX11-NEXT: s_lshr_b64 s[28:29], s[2:3], 24 ; GFX11-NEXT: s_lshr_b64 s[40:41], s[0:1], 24 -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_lo +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 ; GFX11-NEXT: s_cbranch_vccnz .LBB109_3 ; GFX11-NEXT: .LBB109_2: ; %cmp.true ; GFX11-NEXT: s_and_b32 s4, s1, 0xffff0000 @@ -89563,10 +89506,10 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; GFX11-NEXT: s_lshr_b64 s[10:11], s[50:51], 24 ; GFX11-NEXT: s_lshr_b32 s43, s50, 16 ; GFX11-NEXT: s_lshr_b32 s44, s50, 8 -; GFX11-NEXT: s_lshr_b32 s42, s42, 8 +; GFX11-NEXT: s_lshr_b32 s49, s42, 8 ; GFX11-NEXT: .LBB109_3: ; %end ; GFX11-NEXT: s_and_b32 s0, s0, 0xff -; GFX11-NEXT: s_lshl_b32 s5, s42, 8 +; GFX11-NEXT: s_lshl_b32 s5, s49, 8 ; GFX11-NEXT: s_and_b32 s7, s48, 0xff ; GFX11-NEXT: s_lshl_b32 s9, s40, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s5 @@ -89722,8 +89665,9 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 ; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 ; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 -; GFX11-NEXT: v_readlane_b32 s51, v17, 10 -; GFX11-NEXT: v_readlane_b32 s50, v17, 9 +; GFX11-NEXT: v_readlane_b32 s51, v17, 11 +; GFX11-NEXT: v_readlane_b32 s50, v17, 10 +; GFX11-NEXT: v_readlane_b32 s49, v17, 9 ; GFX11-NEXT: v_readlane_b32 s48, v17, 8 ; GFX11-NEXT: v_readlane_b32 s39, v17, 7 ; GFX11-NEXT: v_readlane_b32 s38, v17, 6 @@ -89739,7 +89683,7 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; GFX11-NEXT: 
s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB109_4: -; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr49 ; GFX11-NEXT: ; implicit-def: $sgpr48 ; GFX11-NEXT: ; implicit-def: $sgpr40 ; GFX11-NEXT: ; implicit-def: $sgpr39 @@ -92317,12 +92261,12 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68 ; SI-NEXT: v_readfirstlane_b32 s46, v30 -; SI-NEXT: v_readfirstlane_b32 s44, v23 -; SI-NEXT: v_readfirstlane_b32 s45, v22 -; SI-NEXT: v_readfirstlane_b32 s41, v15 -; SI-NEXT: v_readfirstlane_b32 s43, v14 -; SI-NEXT: v_readfirstlane_b32 s10, v7 -; SI-NEXT: v_readfirstlane_b32 s12, v6 +; SI-NEXT: v_readfirstlane_b32 s41, v23 +; SI-NEXT: v_readfirstlane_b32 s43, v22 +; SI-NEXT: v_readfirstlane_b32 s10, v15 +; SI-NEXT: v_readfirstlane_b32 s12, v14 +; SI-NEXT: v_readfirstlane_b32 s8, v7 +; SI-NEXT: v_readfirstlane_b32 s9, v6 ; SI-NEXT: v_readfirstlane_b32 s7, v1 ; SI-NEXT: v_readfirstlane_b32 s6, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v5 @@ -92355,47 +92299,47 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s17, 24 -; SI-NEXT: s_or_b32 s8, s5, s4 +; SI-NEXT: s_or_b32 s11, s5, s4 ; SI-NEXT: s_and_b32 s4, s18, 0xff ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s19, 24 -; SI-NEXT: s_or_b32 s9, s5, s4 +; SI-NEXT: s_or_b32 s13, s5, s4 ; SI-NEXT: s_and_b32 s4, s20, 0xff ; SI-NEXT: s_lshl_b32 s5, s21, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_lshl_b32 s11, s4, 16 +; SI-NEXT: s_lshl_b32 s14, s4, 16 ; SI-NEXT: s_and_b32 s4, s22, 0xff ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s23, 24 -; SI-NEXT: s_or_b32 s13, s5, s4 +; SI-NEXT: s_or_b32 s15, s5, s4 ; SI-NEXT: s_and_b32 s4, s24, 0xff ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: 
s_lshl_b32 s5, s25, 24 -; SI-NEXT: s_or_b32 s14, s5, s4 +; SI-NEXT: s_or_b32 s40, s5, s4 ; SI-NEXT: s_and_b32 s4, s26, 0xff ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s27, 24 -; SI-NEXT: s_or_b32 s15, s5, s4 +; SI-NEXT: s_or_b32 s42, s5, s4 ; SI-NEXT: s_and_b32 s4, s28, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_lshl_b32 s40, s4, 16 +; SI-NEXT: s_lshl_b32 s44, s4, 16 ; SI-NEXT: s_and_b32 s4, s6, 0xff ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s7, 24 -; SI-NEXT: s_or_b32 s42, s5, s4 -; SI-NEXT: s_and_b32 s4, s12, 0xff -; SI-NEXT: s_lshl_b32 s5, s10, 8 +; SI-NEXT: s_or_b32 s45, s5, s4 +; SI-NEXT: s_and_b32 s4, s9, 0xff +; SI-NEXT: s_lshl_b32 s5, s8, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_lshl_b32 s60, s4, 16 -; SI-NEXT: s_and_b32 s4, s43, 0xff -; SI-NEXT: s_lshl_b32 s5, s41, 8 +; SI-NEXT: s_and_b32 s4, s12, 0xff +; SI-NEXT: s_lshl_b32 s5, s10, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v2 ; SI-NEXT: s_lshl_b32 s61, s4, 16 ; SI-NEXT: v_and_b32_e32 v17, 0xff, v18 -; SI-NEXT: s_and_b32 s4, s45, 0xff -; SI-NEXT: s_lshl_b32 s5, s44, 8 +; SI-NEXT: s_and_b32 s4, s43, 0xff +; SI-NEXT: s_lshl_b32 s5, s41, 8 ; SI-NEXT: v_and_b32_e32 v25, 0xff, v52 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v3 @@ -92543,10 +92487,10 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_or_b32_e32 v13, v22, v13 ; SI-NEXT: v_or_b32_e32 v9, v13, v9 -; SI-NEXT: s_add_i32 s45, s45, 3 +; SI-NEXT: s_add_i32 s43, s43, 3 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x3000000, v9 -; SI-NEXT: s_and_b32 s4, s45, 0xff -; SI-NEXT: s_lshl_b32 s5, s44, 8 +; SI-NEXT: s_and_b32 s4, s43, 0xff +; SI-NEXT: s_lshl_b32 s5, s41, 8 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v24 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 @@ -92566,10 +92510,10 @@ define inreg <32 x bfloat> 
@bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_or_b32_e32 v7, v7, v13 -; SI-NEXT: s_add_i32 s43, s43, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: v_or_b32_e32 v7, v7, v9 -; SI-NEXT: s_and_b32 s4, s43, 0xff -; SI-NEXT: s_lshl_b32 s5, s41, 8 +; SI-NEXT: s_and_b32 s4, s12, 0xff +; SI-NEXT: s_lshl_b32 s5, s10, 8 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v16 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 @@ -92577,10 +92521,10 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: v_or_b32_e32 v6, s4, v6 -; SI-NEXT: s_and_b32 s4, s12, 0xff -; SI-NEXT: s_lshl_b32 s5, s10, 8 +; SI-NEXT: s_and_b32 s4, s9, 0xff +; SI-NEXT: s_lshl_b32 s5, s8, 8 ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 @@ -92668,14 +92612,14 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; SI-NEXT: s_add_i32 s5, s5, 0x3000000 ; SI-NEXT: s_add_i32 s6, s6, 0x3000000 ; SI-NEXT: s_add_i32 s7, s7, 0x3000000 -; SI-NEXT: s_and_b32 s9, s7, 0xffff0000 -; SI-NEXT: s_lshl_b32 s8, s7, 16 -; SI-NEXT: s_and_b32 s13, s6, 0xffff0000 -; SI-NEXT: s_lshl_b32 s11, s6, 16 -; SI-NEXT: s_and_b32 s15, s5, 0xffff0000 -; SI-NEXT: s_lshl_b32 s14, s5, 16 -; SI-NEXT: s_and_b32 s42, s4, 0xffff0000 -; SI-NEXT: s_lshl_b32 s40, s4, 16 +; SI-NEXT: s_and_b32 s13, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s7, 16 +; SI-NEXT: s_and_b32 s15, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s14, s6, 16 +; SI-NEXT: s_and_b32 s42, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s40, s5, 16 +; SI-NEXT: s_and_b32 s45, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s44, s4, 16 ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v0 ; SI-NEXT: v_lshlrev_b32_e32 
v37, 16, v0 ; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v1 @@ -92702,14 +92646,14 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v43 ; SI-NEXT: s_branch .LBB111_5 ; SI-NEXT: .LBB111_3: -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr9 ; SI-NEXT: ; implicit-def: $sgpr11 ; SI-NEXT: ; implicit-def: $sgpr13 ; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: ; implicit-def: $sgpr15 ; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr45 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $sgpr60 @@ -92749,14 +92693,14 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_mov_b32_e32 v2, s11 -; SI-NEXT: v_mov_b32_e32 v3, s13 -; SI-NEXT: v_mov_b32_e32 v4, s14 -; SI-NEXT: v_mov_b32_e32 v5, s15 -; SI-NEXT: v_mov_b32_e32 v6, s40 -; SI-NEXT: v_mov_b32_e32 v7, s42 +; SI-NEXT: v_mov_b32_e32 v0, s11 +; SI-NEXT: v_mov_b32_e32 v1, s13 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: v_mov_b32_e32 v3, s15 +; SI-NEXT: v_mov_b32_e32 v4, s40 +; SI-NEXT: v_mov_b32_e32 v5, s42 +; SI-NEXT: v_mov_b32_e32 v6, s44 +; SI-NEXT: v_mov_b32_e32 v7, s45 ; SI-NEXT: v_mov_b32_e32 v8, v37 ; SI-NEXT: v_mov_b32_e32 v11, v38 ; SI-NEXT: v_mov_b32_e32 v12, v48 @@ -92848,11 +92792,11 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 ; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v39 +; VI-NEXT: v_lshlrev_b32_e32 v22, 
8, v39 ; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v51, 8, v48 ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v49 +; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v49 ; VI-NEXT: s_cbranch_scc0 .LBB111_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -92911,10 +92855,10 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; VI-NEXT: v_or_b32_sdwa v1, v43, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v45, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v45, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v47, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v57, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v57, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v32, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 @@ -92974,9 +92918,9 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v24 ; VI-NEXT: v_or_b32_sdwa v9, v58, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v23 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v57 
+; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v45 ; VI-NEXT: v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v2, v22, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v22, vcc, 0x300, v3 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v55 ; VI-NEXT: v_or_b32_sdwa v8, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -93033,8 +92977,8 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; VI-NEXT: s_addk_i32 s6, 0x300 ; VI-NEXT: s_addk_i32 s8, 0x300 ; VI-NEXT: s_addk_i32 s10, 0x300 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v57 ; VI-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v45 ; VI-NEXT: s_addk_i32 s4, 0x300 ; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: s_lshl_b32 s7, s7, 16 @@ -93042,8 +92986,8 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; VI-NEXT: s_and_b32 s10, s10, 0xffff ; VI-NEXT: s_and_b32 s8, s8, 0xffff ; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 -; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_or_b32 s9, s9, s10 ; VI-NEXT: s_or_b32 s7, s7, s8 ; VI-NEXT: s_or_b32 s5, s5, s6 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll index b8091d8256457..5307ab10c99d0 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll @@ -7701,7 +7701,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; SI-NEXT: 
v_cvt_f16_f32_e32 v62, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v43, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v41, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v40, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v8 @@ -7721,26 +7721,26 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v58, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v60, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v59, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v57, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v61, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v57, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v60, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v58, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v39, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v56, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v42, s20 ; SI-NEXT: v_cvt_f16_f32_e32 v38, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v37, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v36, s25 ; SI-NEXT: v_cvt_f16_f32_e32 v35, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v34, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v61, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v25, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v24, s28 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB19_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v60 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 @@ -7749,7 +7749,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v63 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v42 +; SI-NEXT: 
v_lshlrev_b32_e32 v10, 16, v56 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v40 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v54 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 @@ -7759,12 +7759,12 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 -; SI-NEXT: v_or_b32_e32 v1, v57, v1 -; SI-NEXT: v_or_b32_e32 v2, v56, v2 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: v_or_b32_e32 v2, v42, v2 ; SI-NEXT: v_or_b32_e32 v3, v37, v3 ; SI-NEXT: v_or_b32_e32 v4, v35, v4 -; SI-NEXT: v_or_b32_e32 v5, v61, v5 +; SI-NEXT: v_or_b32_e32 v5, v33, v5 ; SI-NEXT: v_or_b32_e32 v6, v24, v6 ; SI-NEXT: v_or_b32_e32 v7, v47, v7 ; SI-NEXT: v_or_b32_e32 v8, v62, v8 @@ -7781,11 +7781,12 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v19, v21, v19 ; SI-NEXT: s_cbranch_execnz .LBB19_3 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -7799,7 +7800,8 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 ; 
SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 @@ -7813,7 +7815,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v33 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v24 @@ -7840,7 +7842,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v56 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 @@ -7964,20 +7966,20 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB19_4: ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v59, v48 +; SI-NEXT: v_mov_b32_e32 v42, v48 ; SI-NEXT: v_mov_b32_e32 v48, v21 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v60, v49 +; SI-NEXT: v_mov_b32_e32 v58, v49 ; SI-NEXT: v_mov_b32_e32 v49, v20 ; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt 
expcnt(0) ; SI-NEXT: v_mov_b32_e32 v43, v50 ; SI-NEXT: v_mov_b32_e32 v50, v22 -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v44, v51 ; SI-NEXT: v_mov_b32_e32 v51, v23 @@ -7985,22 +7987,21 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v52, v27 ; SI-NEXT: v_mov_b32_e32 v46, v53 ; SI-NEXT: v_mov_b32_e32 v53, v28 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v60, v33 ; SI-NEXT: v_mov_b32_e32 v41, v32 ; SI-NEXT: v_mov_b32_e32 v33, v47 ; SI-NEXT: v_mov_b32_e32 v47, v54 ; SI-NEXT: v_mov_b32_e32 v54, v29 -; SI-NEXT: v_mov_b32_e32 v42, v56 ; SI-NEXT: v_mov_b32_e32 v56, v55 ; SI-NEXT: v_mov_b32_e32 v55, v30 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v36, v57 +; SI-NEXT: v_mov_b32_e32 v59, v57 ; SI-NEXT: v_mov_b32_e32 v57, v40 ; SI-NEXT: v_mov_b32_e32 v40, v31 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v39, v58 -; SI-NEXT: v_mov_b32_e32 v58, v37 +; SI-NEXT: v_mov_b32_e32 v36, v39 +; SI-NEXT: v_mov_b32_e32 v39, v37 ; SI-NEXT: v_mov_b32_e32 v37, v34 ; SI-NEXT: v_mov_b32_e32 v34, v24 ; SI-NEXT: v_mov_b32_e32 v32, v38 @@ -8012,34 +8013,34 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v38, v32 ; SI-NEXT: v_mov_b32_e32 v24, v34 ; SI-NEXT: v_mov_b32_e32 v34, v37 -; SI-NEXT: v_mov_b32_e32 v37, v58 -; SI-NEXT: v_mov_b32_e32 v58, v39 -; SI-NEXT: v_mov_b32_e32 v31, v40 -; SI-NEXT: v_mov_b32_e32 v40, v57 -; 
SI-NEXT: v_mov_b32_e32 v57, v36 +; SI-NEXT: v_mov_b32_e32 v37, v39 +; SI-NEXT: v_mov_b32_e32 v39, v36 ; SI-NEXT: v_mov_b32_e32 v30, v55 ; SI-NEXT: v_mov_b32_e32 v55, v56 -; SI-NEXT: v_mov_b32_e32 v56, v42 +; SI-NEXT: v_mov_b32_e32 v29, v54 +; SI-NEXT: v_mov_b32_e32 v54, v47 +; SI-NEXT: v_mov_b32_e32 v47, v33 ; SI-NEXT: v_mov_b32_e32 v32, v41 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v33, v60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v23, v51 ; SI-NEXT: v_mov_b32_e32 v51, v44 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v22, v50 ; SI-NEXT: v_mov_b32_e32 v50, v43 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v20, v49 -; SI-NEXT: v_mov_b32_e32 v49, v60 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v49, v58 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v21, v48 -; SI-NEXT: v_mov_b32_e32 v48, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v48, v42 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v29, v54 -; SI-NEXT: v_mov_b32_e32 v54, v47 -; SI-NEXT: 
v_mov_b32_e32 v47, v33 +; SI-NEXT: v_mov_b32_e32 v31, v40 +; SI-NEXT: v_mov_b32_e32 v40, v57 +; SI-NEXT: v_mov_b32_e32 v57, v59 ; SI-NEXT: v_mov_b32_e32 v28, v53 ; SI-NEXT: v_mov_b32_e32 v53, v46 ; SI-NEXT: v_mov_b32_e32 v27, v52 @@ -15402,7 +15403,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v62, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v43, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v41, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v40, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v8 @@ -15422,26 +15423,26 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v58, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v60, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v59, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v57, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v61, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v57, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v60, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v58, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v39, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v56, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v42, s20 ; SI-NEXT: v_cvt_f16_f32_e32 v38, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v37, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v36, s25 ; SI-NEXT: v_cvt_f16_f32_e32 v35, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v34, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v61, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v25, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v24, s28 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB35_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v60 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 ; SI-NEXT: 
v_lshlrev_b32_e32 v3, 16, v38 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 @@ -15450,7 +15451,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v63 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v40 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v54 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 @@ -15460,12 +15461,12 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 -; SI-NEXT: v_or_b32_e32 v1, v57, v1 -; SI-NEXT: v_or_b32_e32 v2, v56, v2 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: v_or_b32_e32 v2, v42, v2 ; SI-NEXT: v_or_b32_e32 v3, v37, v3 ; SI-NEXT: v_or_b32_e32 v4, v35, v4 -; SI-NEXT: v_or_b32_e32 v5, v61, v5 +; SI-NEXT: v_or_b32_e32 v5, v33, v5 ; SI-NEXT: v_or_b32_e32 v6, v24, v6 ; SI-NEXT: v_or_b32_e32 v7, v47, v7 ; SI-NEXT: v_or_b32_e32 v8, v62, v8 @@ -15482,11 +15483,12 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 v19, v21, v19 ; SI-NEXT: s_cbranch_execnz .LBB35_3 ; SI-NEXT: .LBB35_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -15500,7 +15502,8 @@ define 
inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 @@ -15514,7 +15517,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v33 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v24 @@ -15541,7 +15544,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v56 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 @@ -15665,20 +15668,20 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB35_4: ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v59, v48 +; SI-NEXT: v_mov_b32_e32 v42, v48 ; SI-NEXT: v_mov_b32_e32 v48, v21 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, 
s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v60, v49 +; SI-NEXT: v_mov_b32_e32 v58, v49 ; SI-NEXT: v_mov_b32_e32 v49, v20 ; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v43, v50 ; SI-NEXT: v_mov_b32_e32 v50, v22 -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v44, v51 ; SI-NEXT: v_mov_b32_e32 v51, v23 @@ -15686,22 +15689,21 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; SI-NEXT: v_mov_b32_e32 v52, v27 ; SI-NEXT: v_mov_b32_e32 v46, v53 ; SI-NEXT: v_mov_b32_e32 v53, v28 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v60, v33 ; SI-NEXT: v_mov_b32_e32 v41, v32 ; SI-NEXT: v_mov_b32_e32 v33, v47 ; SI-NEXT: v_mov_b32_e32 v47, v54 ; SI-NEXT: v_mov_b32_e32 v54, v29 -; SI-NEXT: v_mov_b32_e32 v42, v56 ; SI-NEXT: v_mov_b32_e32 v56, v55 ; SI-NEXT: v_mov_b32_e32 v55, v30 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v36, v57 +; SI-NEXT: v_mov_b32_e32 v59, v57 ; SI-NEXT: v_mov_b32_e32 v57, v40 ; SI-NEXT: v_mov_b32_e32 v40, v31 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v39, v58 -; SI-NEXT: v_mov_b32_e32 v58, v37 +; SI-NEXT: v_mov_b32_e32 v36, v39 +; SI-NEXT: v_mov_b32_e32 v39, v37 ; SI-NEXT: v_mov_b32_e32 v37, v34 ; SI-NEXT: v_mov_b32_e32 v34, v24 ; SI-NEXT: 
v_mov_b32_e32 v32, v38 @@ -15713,34 +15715,34 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; SI-NEXT: v_mov_b32_e32 v38, v32 ; SI-NEXT: v_mov_b32_e32 v24, v34 ; SI-NEXT: v_mov_b32_e32 v34, v37 -; SI-NEXT: v_mov_b32_e32 v37, v58 -; SI-NEXT: v_mov_b32_e32 v58, v39 -; SI-NEXT: v_mov_b32_e32 v31, v40 -; SI-NEXT: v_mov_b32_e32 v40, v57 -; SI-NEXT: v_mov_b32_e32 v57, v36 +; SI-NEXT: v_mov_b32_e32 v37, v39 +; SI-NEXT: v_mov_b32_e32 v39, v36 ; SI-NEXT: v_mov_b32_e32 v30, v55 ; SI-NEXT: v_mov_b32_e32 v55, v56 -; SI-NEXT: v_mov_b32_e32 v56, v42 +; SI-NEXT: v_mov_b32_e32 v29, v54 +; SI-NEXT: v_mov_b32_e32 v54, v47 +; SI-NEXT: v_mov_b32_e32 v47, v33 ; SI-NEXT: v_mov_b32_e32 v32, v41 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v33, v60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v23, v51 ; SI-NEXT: v_mov_b32_e32 v51, v44 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v22, v50 ; SI-NEXT: v_mov_b32_e32 v50, v43 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v20, v49 -; SI-NEXT: v_mov_b32_e32 v49, v60 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v49, v58 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v21, v48 -; SI-NEXT: v_mov_b32_e32 v48, v59 -; 
SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v48, v42 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v29, v54 -; SI-NEXT: v_mov_b32_e32 v54, v47 -; SI-NEXT: v_mov_b32_e32 v47, v33 +; SI-NEXT: v_mov_b32_e32 v31, v40 +; SI-NEXT: v_mov_b32_e32 v40, v57 +; SI-NEXT: v_mov_b32_e32 v57, v59 ; SI-NEXT: v_mov_b32_e32 v28, v53 ; SI-NEXT: v_mov_b32_e32 v53, v46 ; SI-NEXT: v_mov_b32_e32 v27, v52 @@ -22455,7 +22457,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v62, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v43, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v41, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v40, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v8 @@ -22475,26 +22477,26 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v58, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v60, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v59, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v57, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v61, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v57, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v60, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v58, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v39, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v56, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v42, s20 ; SI-NEXT: v_cvt_f16_f32_e32 v38, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v37, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v36, s25 ; SI-NEXT: v_cvt_f16_f32_e32 v35, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v34, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v61, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v25, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v24, s28 ; 
SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v60 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 @@ -22503,7 +22505,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v63 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v40 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v54 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 @@ -22513,12 +22515,12 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 -; SI-NEXT: v_or_b32_e32 v1, v57, v1 -; SI-NEXT: v_or_b32_e32 v2, v56, v2 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: v_or_b32_e32 v2, v42, v2 ; SI-NEXT: v_or_b32_e32 v3, v37, v3 ; SI-NEXT: v_or_b32_e32 v4, v35, v4 -; SI-NEXT: v_or_b32_e32 v5, v61, v5 +; SI-NEXT: v_or_b32_e32 v5, v33, v5 ; SI-NEXT: v_or_b32_e32 v6, v24, v6 ; SI-NEXT: v_or_b32_e32 v7, v47, v7 ; SI-NEXT: v_or_b32_e32 v8, v62, v8 @@ -22535,11 +22537,12 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v19, v21, v19 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: 
v_cvt_f32_f16_e32 v0, v61 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -22553,7 +22556,8 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 @@ -22567,7 +22571,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v33 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v24 @@ -22594,7 +22598,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v56 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 @@ -22718,20 +22722,20 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 
4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v59, v48 +; SI-NEXT: v_mov_b32_e32 v42, v48 ; SI-NEXT: v_mov_b32_e32 v48, v21 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v60, v49 +; SI-NEXT: v_mov_b32_e32 v58, v49 ; SI-NEXT: v_mov_b32_e32 v49, v20 ; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v43, v50 ; SI-NEXT: v_mov_b32_e32 v50, v22 -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v44, v51 ; SI-NEXT: v_mov_b32_e32 v51, v23 @@ -22739,22 +22743,21 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v52, v27 ; SI-NEXT: v_mov_b32_e32 v46, v53 ; SI-NEXT: v_mov_b32_e32 v53, v28 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v60, v33 ; SI-NEXT: v_mov_b32_e32 v41, v32 ; SI-NEXT: v_mov_b32_e32 v33, v47 ; SI-NEXT: v_mov_b32_e32 v47, v54 ; SI-NEXT: v_mov_b32_e32 v54, v29 -; SI-NEXT: v_mov_b32_e32 v42, v56 ; SI-NEXT: v_mov_b32_e32 v56, v55 ; SI-NEXT: v_mov_b32_e32 v55, v30 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v36, 
v57 +; SI-NEXT: v_mov_b32_e32 v59, v57 ; SI-NEXT: v_mov_b32_e32 v57, v40 ; SI-NEXT: v_mov_b32_e32 v40, v31 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v39, v58 -; SI-NEXT: v_mov_b32_e32 v58, v37 +; SI-NEXT: v_mov_b32_e32 v36, v39 +; SI-NEXT: v_mov_b32_e32 v39, v37 ; SI-NEXT: v_mov_b32_e32 v37, v34 ; SI-NEXT: v_mov_b32_e32 v34, v24 ; SI-NEXT: v_mov_b32_e32 v32, v38 @@ -22766,34 +22769,34 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v38, v32 ; SI-NEXT: v_mov_b32_e32 v24, v34 ; SI-NEXT: v_mov_b32_e32 v34, v37 -; SI-NEXT: v_mov_b32_e32 v37, v58 -; SI-NEXT: v_mov_b32_e32 v58, v39 -; SI-NEXT: v_mov_b32_e32 v31, v40 -; SI-NEXT: v_mov_b32_e32 v40, v57 -; SI-NEXT: v_mov_b32_e32 v57, v36 +; SI-NEXT: v_mov_b32_e32 v37, v39 +; SI-NEXT: v_mov_b32_e32 v39, v36 ; SI-NEXT: v_mov_b32_e32 v30, v55 ; SI-NEXT: v_mov_b32_e32 v55, v56 -; SI-NEXT: v_mov_b32_e32 v56, v42 +; SI-NEXT: v_mov_b32_e32 v29, v54 +; SI-NEXT: v_mov_b32_e32 v54, v47 +; SI-NEXT: v_mov_b32_e32 v47, v33 ; SI-NEXT: v_mov_b32_e32 v32, v41 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v33, v60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v23, v51 ; SI-NEXT: v_mov_b32_e32 v51, v44 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v22, v50 ; SI-NEXT: v_mov_b32_e32 v50, v43 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v20, v49 -; SI-NEXT: v_mov_b32_e32 v49, v60 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; 
SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v49, v58 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v21, v48 -; SI-NEXT: v_mov_b32_e32 v48, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v48, v42 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v29, v54 -; SI-NEXT: v_mov_b32_e32 v54, v47 -; SI-NEXT: v_mov_b32_e32 v47, v33 +; SI-NEXT: v_mov_b32_e32 v31, v40 +; SI-NEXT: v_mov_b32_e32 v40, v57 +; SI-NEXT: v_mov_b32_e32 v57, v59 ; SI-NEXT: v_mov_b32_e32 v28, v53 ; SI-NEXT: v_mov_b32_e32 v53, v46 ; SI-NEXT: v_mov_b32_e32 v27, v52 @@ -28619,7 +28622,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v62, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v43, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v41, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v40, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v8 @@ -28639,26 +28642,26 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v58, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v60, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v59, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v57, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v61, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v57, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v60, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v58, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v39, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v56, s20 +; SI-NEXT: v_cvt_f16_f32_e32 
v42, s20 ; SI-NEXT: v_cvt_f16_f32_e32 v38, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v37, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v36, s25 ; SI-NEXT: v_cvt_f16_f32_e32 v35, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v34, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v61, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v25, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v24, s28 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB55_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v60 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 @@ -28667,7 +28670,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v63 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v40 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v54 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 @@ -28677,12 +28680,12 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 -; SI-NEXT: v_or_b32_e32 v1, v57, v1 -; SI-NEXT: v_or_b32_e32 v2, v56, v2 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: v_or_b32_e32 v2, v42, v2 ; SI-NEXT: v_or_b32_e32 v3, v37, v3 ; SI-NEXT: v_or_b32_e32 v4, v35, v4 -; SI-NEXT: v_or_b32_e32 v5, v61, v5 +; SI-NEXT: v_or_b32_e32 v5, v33, v5 ; SI-NEXT: v_or_b32_e32 v6, v24, v6 ; SI-NEXT: v_or_b32_e32 v7, v47, v7 ; SI-NEXT: v_or_b32_e32 v8, v62, v8 @@ -28699,11 +28702,12 @@ define inreg <10 x double> 
@bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; SI-NEXT: v_or_b32_e32 v19, v21, v19 ; SI-NEXT: s_cbranch_execnz .LBB55_3 ; SI-NEXT: .LBB55_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -28717,7 +28721,8 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 @@ -28731,7 +28736,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v33 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v24 @@ -28758,7 +28763,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v56 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 @@ -28882,20 +28887,20 @@ define inreg 
<10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB55_4: ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v59, v48 +; SI-NEXT: v_mov_b32_e32 v42, v48 ; SI-NEXT: v_mov_b32_e32 v48, v21 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v60, v49 +; SI-NEXT: v_mov_b32_e32 v58, v49 ; SI-NEXT: v_mov_b32_e32 v49, v20 ; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v43, v50 ; SI-NEXT: v_mov_b32_e32 v50, v22 -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v44, v51 ; SI-NEXT: v_mov_b32_e32 v51, v23 @@ -28903,22 +28908,21 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; SI-NEXT: v_mov_b32_e32 v52, v27 ; SI-NEXT: v_mov_b32_e32 v46, v53 ; SI-NEXT: v_mov_b32_e32 v53, v28 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v60, v33 ; SI-NEXT: v_mov_b32_e32 v41, v32 ; SI-NEXT: v_mov_b32_e32 v33, v47 ; SI-NEXT: v_mov_b32_e32 v47, v54 ; SI-NEXT: v_mov_b32_e32 v54, v29 -; SI-NEXT: v_mov_b32_e32 v42, v56 ; SI-NEXT: v_mov_b32_e32 v56, v55 ; SI-NEXT: v_mov_b32_e32 v55, v30 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v36, v57 +; SI-NEXT: v_mov_b32_e32 v59, v57 ; SI-NEXT: v_mov_b32_e32 v57, v40 ; SI-NEXT: v_mov_b32_e32 v40, v31 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v39, v58 -; SI-NEXT: v_mov_b32_e32 v58, v37 +; SI-NEXT: v_mov_b32_e32 v36, v39 +; SI-NEXT: v_mov_b32_e32 v39, v37 ; SI-NEXT: v_mov_b32_e32 v37, v34 ; SI-NEXT: v_mov_b32_e32 v34, v24 ; SI-NEXT: v_mov_b32_e32 v32, v38 @@ -28930,34 +28934,34 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; SI-NEXT: v_mov_b32_e32 v38, v32 ; SI-NEXT: v_mov_b32_e32 v24, v34 ; SI-NEXT: v_mov_b32_e32 v34, v37 -; SI-NEXT: v_mov_b32_e32 v37, v58 -; SI-NEXT: v_mov_b32_e32 v58, v39 -; SI-NEXT: v_mov_b32_e32 v31, v40 -; SI-NEXT: v_mov_b32_e32 v40, v57 -; SI-NEXT: v_mov_b32_e32 v57, v36 +; SI-NEXT: v_mov_b32_e32 v37, v39 +; SI-NEXT: v_mov_b32_e32 v39, v36 ; SI-NEXT: v_mov_b32_e32 v30, v55 ; SI-NEXT: v_mov_b32_e32 v55, v56 -; SI-NEXT: v_mov_b32_e32 v56, v42 +; SI-NEXT: v_mov_b32_e32 v29, v54 +; SI-NEXT: v_mov_b32_e32 v54, v47 +; SI-NEXT: v_mov_b32_e32 v47, v33 ; SI-NEXT: v_mov_b32_e32 v32, v41 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v33, v60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v23, v51 ; SI-NEXT: v_mov_b32_e32 v51, v44 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, 
off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v22, v50 ; SI-NEXT: v_mov_b32_e32 v50, v43 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v20, v49 -; SI-NEXT: v_mov_b32_e32 v49, v60 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v49, v58 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v21, v48 -; SI-NEXT: v_mov_b32_e32 v48, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v48, v42 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v29, v54 -; SI-NEXT: v_mov_b32_e32 v54, v47 -; SI-NEXT: v_mov_b32_e32 v47, v33 +; SI-NEXT: v_mov_b32_e32 v31, v40 +; SI-NEXT: v_mov_b32_e32 v40, v57 +; SI-NEXT: v_mov_b32_e32 v57, v59 ; SI-NEXT: v_mov_b32_e32 v28, v53 ; SI-NEXT: v_mov_b32_e32 v53, v46 ; SI-NEXT: v_mov_b32_e32 v27, v52 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll index 6fe66655de3d6..fbe953d78c845 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll @@ -2828,7 +2828,7 @@ define inreg <8 x i8> @bitcast_i64_to_v8i8_scalar(i64 inreg %a, i32 inreg %b) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 -; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_mov_b32 s7, 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB25_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_lshr_b64 s[2:3], s[0:1], 24 @@ -2836,8 +2836,8 @@ define 
inreg <8 x i8> @bitcast_i64_to_v8i8_scalar(i64 inreg %a, i32 inreg %b) { ; GFX11-NEXT: s_lshr_b32 s4, s1, 16 ; GFX11-NEXT: s_lshr_b32 s5, s1, 8 ; GFX11-NEXT: s_lshr_b32 s6, s0, 16 -; GFX11-NEXT: s_lshr_b32 s7, s0, 8 -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_lshr_b32 s8, s0, 8 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s7 ; GFX11-NEXT: s_cbranch_vccnz .LBB25_3 ; GFX11-NEXT: .LBB25_2: ; %cmp.true ; GFX11-NEXT: s_add_u32 s0, s0, 3 @@ -2847,16 +2847,16 @@ define inreg <8 x i8> @bitcast_i64_to_v8i8_scalar(i64 inreg %a, i32 inreg %b) { ; GFX11-NEXT: s_lshr_b32 s3, s1, 24 ; GFX11-NEXT: s_lshr_b32 s4, s1, 16 ; GFX11-NEXT: s_lshr_b32 s5, s1, 8 -; GFX11-NEXT: s_lshr_b32 s7, s0, 8 +; GFX11-NEXT: s_lshr_b32 s8, s0, 8 ; GFX11-NEXT: .LBB25_3: ; %end ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s7 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s8 ; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s2 ; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s5 ; GFX11-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB25_4: -; GFX11-NEXT: ; implicit-def: $sgpr7 +; GFX11-NEXT: ; implicit-def: $sgpr8 ; GFX11-NEXT: ; implicit-def: $sgpr6 ; GFX11-NEXT: ; implicit-def: $sgpr2 ; GFX11-NEXT: ; implicit-def: $sgpr5 @@ -8826,7 +8826,7 @@ define inreg <8 x i8> @bitcast_v2i32_to_v8i8_scalar(<2 x i32> inreg %a, i32 inre ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 -; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_mov_b32 s7, 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB69_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_lshr_b64 s[2:3], s[0:1], 24 @@ -8834,8 +8834,8 @@ define inreg <8 x i8> @bitcast_v2i32_to_v8i8_scalar(<2 x i32> inreg %a, i32 inre ; GFX11-NEXT: s_lshr_b32 s4, s1, 16 ; GFX11-NEXT: s_lshr_b32 s5, s1, 8 ; GFX11-NEXT: s_lshr_b32 s6, s0, 16 -; GFX11-NEXT: s_lshr_b32 
s7, s0, 8 -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_lshr_b32 s8, s0, 8 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s7 ; GFX11-NEXT: s_cbranch_vccnz .LBB69_3 ; GFX11-NEXT: .LBB69_2: ; %cmp.true ; GFX11-NEXT: s_add_i32 s1, s1, 3 @@ -8845,16 +8845,16 @@ define inreg <8 x i8> @bitcast_v2i32_to_v8i8_scalar(<2 x i32> inreg %a, i32 inre ; GFX11-NEXT: s_lshr_b32 s3, s1, 24 ; GFX11-NEXT: s_lshr_b32 s5, s1, 8 ; GFX11-NEXT: s_lshr_b32 s6, s0, 16 -; GFX11-NEXT: s_lshr_b32 s7, s0, 8 +; GFX11-NEXT: s_lshr_b32 s8, s0, 8 ; GFX11-NEXT: .LBB69_3: ; %end ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s7 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s8 ; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s2 ; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s5 ; GFX11-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB69_4: -; GFX11-NEXT: ; implicit-def: $sgpr7 +; GFX11-NEXT: ; implicit-def: $sgpr8 ; GFX11-NEXT: ; implicit-def: $sgpr6 ; GFX11-NEXT: ; implicit-def: $sgpr2 ; GFX11-NEXT: ; implicit-def: $sgpr5 @@ -13249,47 +13249,45 @@ define <8 x i8> @bitcast_v4i16_to_v8i8(<4 x i16> %a, i32 %b) { ; VI-LABEL: bitcast_v4i16_to_v8i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NEXT: ; implicit-def: $vgpr9 -; VI-NEXT: ; implicit-def: $vgpr4 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; VI-NEXT: ; implicit-def: $vgpr1 ; VI-NEXT: ; implicit-def: $vgpr3 -; VI-NEXT: ; implicit-def: $vgpr8 ; VI-NEXT: ; implicit-def: $vgpr5 ; VI-NEXT: ; implicit-def: $vgpr7 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: ; %bb.1: ; %cmp.false -; 
VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v0 -; VI-NEXT: v_mov_b32_e32 v9, v0 -; VI-NEXT: v_mov_b32_e32 v8, v1 -; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v8 ; VI-NEXT: ; %bb.2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB96_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v2, 3 -; VI-NEXT: v_add_u16_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v8, 3, v1 +; VI-NEXT: v_mov_b32_e32 v0, 3 +; VI-NEXT: v_add_u16_sdwa v6, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v2, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v10, 3, v9 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; VI-NEXT: v_add_u16_e32 v9, 3, v0 +; VI-NEXT: v_add_u16_e32 v9, 3, v8 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; VI-NEXT: v_or_b32_e32 v1, v8, v1 +; VI-NEXT: v_or_b32_e32 v1, v10, v1 ; VI-NEXT: v_or_b32_e32 v0, v9, v0 ; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v0 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; VI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; VI-NEXT: v_mov_b32_e32 v8, v9 +; VI-NEXT: v_mov_b32_e32 v9, v10 ; VI-NEXT: .LBB96_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_mov_b32_e32 v0, v9 -; VI-NEXT: v_mov_b32_e32 v1, v4 -; VI-NEXT: v_mov_b32_e32 v4, v8 +; VI-NEXT: v_mov_b32_e32 v0, v8 +; VI-NEXT: v_mov_b32_e32 v4, v9 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v4i16_to_v8i8: @@ -13486,48 +13484,47 @@ define 
inreg <8 x i8> @bitcast_v4i16_to_v8i8_scalar(<4 x i16> inreg %a, i32 inre ; VI-NEXT: s_cbranch_scc0 .LBB97_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 -; VI-NEXT: s_lshr_b32 s10, s17, 24 +; VI-NEXT: s_lshr_b32 s9, s17, 24 ; VI-NEXT: s_lshr_b32 s8, s17, 16 ; VI-NEXT: s_lshr_b32 s5, s17, 8 -; VI-NEXT: s_lshr_b32 s11, s16, 16 -; VI-NEXT: s_lshr_b32 s12, s16, 8 -; VI-NEXT: s_mov_b32 s9, s17 +; VI-NEXT: s_lshr_b32 s10, s16, 16 +; VI-NEXT: s_lshr_b32 s11, s16, 8 ; VI-NEXT: s_cbranch_execnz .LBB97_3 ; VI-NEXT: .LBB97_2: ; %cmp.true -; VI-NEXT: s_lshr_b32 s5, s17, 16 -; VI-NEXT: s_add_i32 s9, s17, 3 -; VI-NEXT: s_add_i32 s8, s5, 3 -; VI-NEXT: s_and_b32 s4, s9, 0xffff -; VI-NEXT: s_lshl_b32 s5, s8, 16 -; VI-NEXT: s_or_b32 s7, s4, s5 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_lshr_b32 s6, s17, 16 +; VI-NEXT: s_add_i32 s4, s17, 3 +; VI-NEXT: s_add_i32 s8, s6, 3 +; VI-NEXT: s_and_b32 s5, s4, 0xffff +; VI-NEXT: s_lshl_b32 s6, s8, 16 +; VI-NEXT: s_or_b32 s7, s5, s6 +; VI-NEXT: s_and_b32 s5, s16, 0xffff0000 ; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_and_b32 s5, s16, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s6, s4, 0x30000 +; VI-NEXT: s_and_b32 s6, s16, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_add_i32 s6, s5, 0x30000 +; VI-NEXT: s_mov_b32 s17, s4 ; VI-NEXT: s_lshr_b64 s[4:5], s[6:7], 24 ; VI-NEXT: s_lshr_b32 s5, s7, 8 -; VI-NEXT: s_lshr_b32 s11, s6, 16 -; VI-NEXT: s_lshr_b32 s12, s6, 8 -; VI-NEXT: s_bfe_u32 s10, s8, 0x80008 +; VI-NEXT: s_lshr_b32 s10, s6, 16 +; VI-NEXT: s_lshr_b32 s11, s6, 8 +; VI-NEXT: s_bfe_u32 s9, s8, 0x80008 ; VI-NEXT: .LBB97_3: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s12 -; VI-NEXT: v_mov_b32_e32 v2, s11 +; VI-NEXT: v_mov_b32_e32 v1, s11 +; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: v_mov_b32_e32 v3, s4 -; VI-NEXT: v_mov_b32_e32 v4, s9 +; VI-NEXT: v_mov_b32_e32 v4, s17 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: v_mov_b32_e32 v6, s8 -; VI-NEXT: 
v_mov_b32_e32 v7, s10 +; VI-NEXT: v_mov_b32_e32 v7, s9 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB97_4: -; VI-NEXT: ; implicit-def: $sgpr12 ; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr10 ; VI-NEXT: ; implicit-def: $sgpr4 -; VI-NEXT: ; implicit-def: $sgpr9 ; VI-NEXT: ; implicit-def: $sgpr5 ; VI-NEXT: ; implicit-def: $sgpr8 -; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr9 ; VI-NEXT: s_branch .LBB97_2 ; ; GFX9-LABEL: bitcast_v4i16_to_v8i8_scalar: @@ -16302,76 +16299,79 @@ define <8 x i8> @bitcast_v4bf16_to_v8i8(<4 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-LABEL: bitcast_v4bf16_to_v8i8: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 -; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v2 +; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB108_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[10:11], 24, v[8:9] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v8 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v9.l -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v9.h +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[8:9], 24, v[0:1] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v1.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v1.l ; GFX11-TRUE16-NEXT: .LBB108_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB108_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v8.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v0 :: v_dual_add_f32 v0, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v9.l -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_add3_u32 v11, v4, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add3_u32 v4, v6, v0, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v7, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v1, 0x7fff -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v2, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v9, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v7, v6, v7 :: v_dual_add_f32 v2, 0x40c00000, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v11, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v9, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v0, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v7.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v2, v1 -; GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v3, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[10:11], 24, v[8:9] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v0, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v7.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[8:9], 24, v[10:11] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v11 ; GFX11-TRUE16-NEXT: .LBB108_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v8.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v10.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll index 
9f5c9c4c509ed..422aa01b64002 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll @@ -8429,13 +8429,13 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v39, v1 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v60, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v9 @@ -8463,37 +8463,36 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 ; SI-NEXT: v_cvt_f16_f32_e32 v33, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v34, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s21 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v62, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v63, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v62, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v61, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v36, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v49, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v29, s27 ; SI-NEXT: v_cvt_f16_f32_e32 v28, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v27, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v26, s28 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; 
SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB19_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v62 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v61 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v29 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v48 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v35 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 @@ -8508,14 +8507,14 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 ; SI-NEXT: v_or_b32_e32 v0, v33, v0 -; SI-NEXT: v_or_b32_e32 v1, v34, v1 -; SI-NEXT: v_or_b32_e32 v3, v62, v3 -; SI-NEXT: v_or_b32_e32 v4, v36, v4 +; SI-NEXT: v_or_b32_e32 v2, v63, v2 +; SI-NEXT: v_or_b32_e32 v3, v34, v3 +; SI-NEXT: v_or_b32_e32 v4, v49, v4 ; SI-NEXT: v_or_b32_e32 v5, v28, v5 ; SI-NEXT: v_or_b32_e32 v6, v26, v6 -; SI-NEXT: v_or_b32_e32 v7, v48, v7 +; SI-NEXT: v_or_b32_e32 v7, v60, v7 ; SI-NEXT: v_or_b32_e32 v8, v38, v8 -; SI-NEXT: v_or_b32_e32 v9, v49, v9 +; 
SI-NEXT: v_or_b32_e32 v9, v36, v9 ; SI-NEXT: v_or_b32_e32 v10, v50, v10 ; SI-NEXT: v_or_b32_e32 v11, v59, v11 ; SI-NEXT: v_or_b32_e32 v12, v57, v12 @@ -8531,65 +8530,64 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; SI-NEXT: s_cbranch_execnz .LBB19_3 ; SI-NEXT: .LBB19_2: ; %cmp.true ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v62 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v26 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v48 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v60 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v37 -; SI-NEXT: 
v_cvt_f32_f16_e32 v11, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v50 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v36 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v50 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v58 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v44 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v47 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v44 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v43 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v53 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v53 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v31 @@ -8603,25 +8601,27 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: 
v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v62 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -8641,12 +8641,12 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v39 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v48 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 @@ -8734,86 +8734,89 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; SI-NEXT: s_setpc_b64 
s[30:31] ; SI-NEXT: .LBB19_4: ; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v59, v46 ; SI-NEXT: v_mov_b32_e32 v46, v41 ; SI-NEXT: v_mov_b32_e32 v41, v52 ; SI-NEXT: v_mov_b32_e32 v52, v23 -; SI-NEXT: v_mov_b32_e32 v48, v60 +; SI-NEXT: v_mov_b32_e32 v48, v39 +; SI-NEXT: v_mov_b32_e32 v39, v60 ; SI-NEXT: v_mov_b32_e32 v60, v47 ; SI-NEXT: v_mov_b32_e32 v47, v42 ; SI-NEXT: v_mov_b32_e32 v42, v53 ; SI-NEXT: v_mov_b32_e32 v53, v22 -; SI-NEXT: v_mov_b32_e32 v35, v61 +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v34, v61 ; 
SI-NEXT: v_mov_b32_e32 v61, v56 ; SI-NEXT: v_mov_b32_e32 v56, v43 ; SI-NEXT: v_mov_b32_e32 v43, v54 ; SI-NEXT: v_mov_b32_e32 v54, v24 -; SI-NEXT: v_mov_b32_e32 v50, v34 -; SI-NEXT: v_mov_b32_e32 v34, v62 +; SI-NEXT: v_mov_b32_e32 v50, v62 ; SI-NEXT: v_mov_b32_e32 v62, v57 ; SI-NEXT: v_mov_b32_e32 v57, v44 ; SI-NEXT: v_mov_b32_e32 v44, v55 ; SI-NEXT: v_mov_b32_e32 v55, v25 -; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: v_mov_b32_e32 v32, v51 +; SI-NEXT: v_mov_b32_e32 v51, v33 ; SI-NEXT: v_mov_b32_e32 v33, v63 ; SI-NEXT: v_mov_b32_e32 v63, v58 ; SI-NEXT: v_mov_b32_e32 v58, v45 ; SI-NEXT: v_mov_b32_e32 v45, v40 ; SI-NEXT: v_mov_b32_e32 v40, v31 -; SI-NEXT: v_mov_b32_e32 v39, v26 -; SI-NEXT: v_mov_b32_e32 v38, v27 -; SI-NEXT: v_mov_b32_e32 v37, v28 -; SI-NEXT: v_mov_b32_e32 v49, v36 -; SI-NEXT: v_mov_b32_e32 v36, v29 +; SI-NEXT: v_mov_b32_e32 v38, v26 +; SI-NEXT: v_mov_b32_e32 v37, v27 +; SI-NEXT: v_mov_b32_e32 v36, v28 +; SI-NEXT: v_mov_b32_e32 v35, v49 +; SI-NEXT: v_mov_b32_e32 v49, v29 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v29, v36 -; SI-NEXT: v_mov_b32_e32 v36, v49 -; SI-NEXT: v_mov_b32_e32 v28, v37 -; SI-NEXT: v_mov_b32_e32 v27, v38 -; SI-NEXT: v_mov_b32_e32 v26, v39 +; SI-NEXT: v_mov_b32_e32 v29, v49 +; SI-NEXT: v_mov_b32_e32 v49, v35 +; SI-NEXT: v_mov_b32_e32 v28, v36 +; SI-NEXT: v_mov_b32_e32 v27, v37 +; SI-NEXT: v_mov_b32_e32 v26, v38 ; SI-NEXT: v_mov_b32_e32 v31, v40 ; SI-NEXT: v_mov_b32_e32 v40, v45 ; SI-NEXT: v_mov_b32_e32 v45, v58 ; SI-NEXT: v_mov_b32_e32 v58, v63 ; SI-NEXT: v_mov_b32_e32 v63, v33 -; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v33, v51 +; SI-NEXT: v_mov_b32_e32 v51, v32 ; SI-NEXT: v_mov_b32_e32 v25, v55 ; SI-NEXT: v_mov_b32_e32 v55, v44 ; SI-NEXT: v_mov_b32_e32 v44, v57 ; SI-NEXT: v_mov_b32_e32 v57, 
v62 -; SI-NEXT: v_mov_b32_e32 v62, v34 -; SI-NEXT: v_mov_b32_e32 v34, v50 +; SI-NEXT: v_mov_b32_e32 v62, v50 ; SI-NEXT: v_mov_b32_e32 v24, v54 ; SI-NEXT: v_mov_b32_e32 v54, v43 ; SI-NEXT: v_mov_b32_e32 v43, v56 ; SI-NEXT: v_mov_b32_e32 v56, v61 -; SI-NEXT: v_mov_b32_e32 v61, v35 +; SI-NEXT: v_mov_b32_e32 v61, v34 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v22, v53 ; SI-NEXT: v_mov_b32_e32 v53, v42 ; SI-NEXT: v_mov_b32_e32 v42, v47 ; SI-NEXT: v_mov_b32_e32 v47, v60 -; SI-NEXT: v_mov_b32_e32 v60, v48 +; SI-NEXT: v_mov_b32_e32 v60, v39 +; SI-NEXT: v_mov_b32_e32 v39, v48 ; SI-NEXT: v_mov_b32_e32 v23, v52 ; SI-NEXT: v_mov_b32_e32 v52, v41 ; SI-NEXT: v_mov_b32_e32 v41, v46 ; SI-NEXT: v_mov_b32_e32 v46, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v38, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB19_2 ; @@ -16899,13 +16902,13 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v39, v1 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v60, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v9 @@ -16933,37 +16936,36 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 ; SI-NEXT: v_cvt_f16_f32_e32 v33, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v34, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s21 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v62, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v63, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v62, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v61, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v36, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v49, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v29, s27 ; SI-NEXT: v_cvt_f16_f32_e32 v28, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v27, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v26, s28 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 
; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB35_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v62 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v61 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v29 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v48 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v35 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 @@ -16978,14 +16980,14 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 ; SI-NEXT: v_or_b32_e32 v0, v33, v0 -; SI-NEXT: v_or_b32_e32 v1, v34, v1 -; SI-NEXT: v_or_b32_e32 v3, v62, v3 -; SI-NEXT: v_or_b32_e32 v4, v36, v4 +; SI-NEXT: v_or_b32_e32 v2, v63, v2 +; SI-NEXT: v_or_b32_e32 v3, v34, v3 +; SI-NEXT: v_or_b32_e32 v4, v49, v4 ; SI-NEXT: v_or_b32_e32 v5, v28, v5 ; SI-NEXT: v_or_b32_e32 v6, v26, v6 -; SI-NEXT: v_or_b32_e32 v7, v48, v7 +; SI-NEXT: v_or_b32_e32 v7, v60, v7 ; SI-NEXT: v_or_b32_e32 v8, v38, v8 
-; SI-NEXT: v_or_b32_e32 v9, v49, v9 +; SI-NEXT: v_or_b32_e32 v9, v36, v9 ; SI-NEXT: v_or_b32_e32 v10, v50, v10 ; SI-NEXT: v_or_b32_e32 v11, v59, v11 ; SI-NEXT: v_or_b32_e32 v12, v57, v12 @@ -17001,65 +17003,64 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; SI-NEXT: s_cbranch_execnz .LBB35_3 ; SI-NEXT: .LBB35_2: ; %cmp.true ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v62 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v26 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v48 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v60 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 ; 
SI-NEXT: v_cvt_f32_f16_e32 v10, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v50 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v36 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v50 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v58 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v44 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v47 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v44 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v43 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v53 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v53 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v31 @@ -17073,25 +17074,27 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt 
vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v62 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -17111,12 +17114,12 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v39 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v48 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 @@ -17204,86 +17207,89 @@ define inreg <22 x float> 
@bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB35_4: ; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v59, v46 ; SI-NEXT: v_mov_b32_e32 v46, v41 ; SI-NEXT: v_mov_b32_e32 v41, v52 ; SI-NEXT: v_mov_b32_e32 v52, v23 -; SI-NEXT: v_mov_b32_e32 v48, v60 +; SI-NEXT: v_mov_b32_e32 v48, v39 +; SI-NEXT: v_mov_b32_e32 v39, v60 ; SI-NEXT: v_mov_b32_e32 v60, v47 ; SI-NEXT: v_mov_b32_e32 v47, v42 ; SI-NEXT: v_mov_b32_e32 v42, v53 ; SI-NEXT: v_mov_b32_e32 v53, v22 -; SI-NEXT: v_mov_b32_e32 v35, v61 +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded 
Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v34, v61 ; SI-NEXT: v_mov_b32_e32 v61, v56 ; SI-NEXT: v_mov_b32_e32 v56, v43 ; SI-NEXT: v_mov_b32_e32 v43, v54 ; SI-NEXT: v_mov_b32_e32 v54, v24 -; SI-NEXT: v_mov_b32_e32 v50, v34 -; SI-NEXT: v_mov_b32_e32 v34, v62 +; SI-NEXT: v_mov_b32_e32 v50, v62 ; SI-NEXT: v_mov_b32_e32 v62, v57 ; SI-NEXT: v_mov_b32_e32 v57, v44 ; SI-NEXT: v_mov_b32_e32 v44, v55 ; SI-NEXT: v_mov_b32_e32 v55, v25 -; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: v_mov_b32_e32 v32, v51 +; SI-NEXT: v_mov_b32_e32 v51, v33 ; SI-NEXT: v_mov_b32_e32 v33, v63 ; SI-NEXT: v_mov_b32_e32 v63, v58 ; SI-NEXT: v_mov_b32_e32 v58, v45 ; SI-NEXT: v_mov_b32_e32 v45, v40 ; SI-NEXT: v_mov_b32_e32 v40, v31 -; SI-NEXT: v_mov_b32_e32 v39, v26 -; SI-NEXT: v_mov_b32_e32 v38, v27 -; SI-NEXT: v_mov_b32_e32 v37, v28 -; SI-NEXT: v_mov_b32_e32 v49, v36 -; SI-NEXT: v_mov_b32_e32 v36, v29 +; SI-NEXT: v_mov_b32_e32 v38, v26 +; SI-NEXT: v_mov_b32_e32 v37, v27 +; SI-NEXT: v_mov_b32_e32 v36, v28 +; SI-NEXT: v_mov_b32_e32 v35, v49 +; SI-NEXT: v_mov_b32_e32 v49, v29 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v29, v36 -; SI-NEXT: v_mov_b32_e32 v36, v49 -; SI-NEXT: v_mov_b32_e32 v28, v37 -; SI-NEXT: v_mov_b32_e32 v27, v38 -; SI-NEXT: v_mov_b32_e32 v26, v39 +; SI-NEXT: v_mov_b32_e32 v29, v49 +; SI-NEXT: v_mov_b32_e32 v49, v35 +; SI-NEXT: v_mov_b32_e32 v28, v36 +; SI-NEXT: v_mov_b32_e32 v27, v37 +; SI-NEXT: v_mov_b32_e32 v26, v38 ; SI-NEXT: v_mov_b32_e32 v31, v40 ; SI-NEXT: v_mov_b32_e32 v40, v45 ; SI-NEXT: v_mov_b32_e32 v45, v58 ; SI-NEXT: v_mov_b32_e32 v58, v63 ; SI-NEXT: v_mov_b32_e32 v63, v33 -; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v33, v51 +; SI-NEXT: v_mov_b32_e32 v51, v32 ; SI-NEXT: v_mov_b32_e32 v25, v55 ; SI-NEXT: 
v_mov_b32_e32 v55, v44 ; SI-NEXT: v_mov_b32_e32 v44, v57 ; SI-NEXT: v_mov_b32_e32 v57, v62 -; SI-NEXT: v_mov_b32_e32 v62, v34 -; SI-NEXT: v_mov_b32_e32 v34, v50 +; SI-NEXT: v_mov_b32_e32 v62, v50 ; SI-NEXT: v_mov_b32_e32 v24, v54 ; SI-NEXT: v_mov_b32_e32 v54, v43 ; SI-NEXT: v_mov_b32_e32 v43, v56 ; SI-NEXT: v_mov_b32_e32 v56, v61 -; SI-NEXT: v_mov_b32_e32 v61, v35 +; SI-NEXT: v_mov_b32_e32 v61, v34 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v22, v53 ; SI-NEXT: v_mov_b32_e32 v53, v42 ; SI-NEXT: v_mov_b32_e32 v42, v47 ; SI-NEXT: v_mov_b32_e32 v47, v60 -; SI-NEXT: v_mov_b32_e32 v60, v48 +; SI-NEXT: v_mov_b32_e32 v60, v39 +; SI-NEXT: v_mov_b32_e32 v39, v48 ; SI-NEXT: v_mov_b32_e32 v23, v52 ; SI-NEXT: v_mov_b32_e32 v52, v41 ; SI-NEXT: v_mov_b32_e32 v41, v46 ; SI-NEXT: v_mov_b32_e32 v46, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v37, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB35_2 ; @@ -24685,13 +24691,13 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v39, v1 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v60, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v9 @@ -24719,37 +24725,36 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 ; SI-NEXT: v_cvt_f16_f32_e32 v33, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v34, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s21 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v62, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v63, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v62, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v61, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v36, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v49, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v29, s27 ; SI-NEXT: v_cvt_f16_f32_e32 v28, s26 ; SI-NEXT: v_cvt_f16_f32_e32 
v27, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v26, s28 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v62 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v61 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v29 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v48 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v35 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 @@ -24764,14 +24769,14 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 ; SI-NEXT: v_or_b32_e32 v0, v33, v0 -; SI-NEXT: v_or_b32_e32 v1, v34, v1 -; SI-NEXT: v_or_b32_e32 v3, v62, v3 -; SI-NEXT: v_or_b32_e32 v4, v36, v4 +; SI-NEXT: v_or_b32_e32 v2, v63, v2 +; SI-NEXT: v_or_b32_e32 v3, v34, v3 +; SI-NEXT: v_or_b32_e32 v4, v49, v4 ; SI-NEXT: v_or_b32_e32 v5, v28, v5 ; SI-NEXT: v_or_b32_e32 v6, v26, v6 -; SI-NEXT: v_or_b32_e32 
v7, v48, v7 +; SI-NEXT: v_or_b32_e32 v7, v60, v7 ; SI-NEXT: v_or_b32_e32 v8, v38, v8 -; SI-NEXT: v_or_b32_e32 v9, v49, v9 +; SI-NEXT: v_or_b32_e32 v9, v36, v9 ; SI-NEXT: v_or_b32_e32 v10, v50, v10 ; SI-NEXT: v_or_b32_e32 v11, v59, v11 ; SI-NEXT: v_or_b32_e32 v12, v57, v12 @@ -24787,65 +24792,64 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v62 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v26 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v48 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v60 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, 
v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v50 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v36 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v50 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v58 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v44 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v47 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v44 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v43 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v53 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v53 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v31 @@ -24859,25 +24863,27 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v23, 
0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v62 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -24897,12 +24903,12 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v39 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v48 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 @@ 
-24990,86 +24996,89 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: ; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v59, v46 ; SI-NEXT: v_mov_b32_e32 v46, v41 ; SI-NEXT: v_mov_b32_e32 v41, v52 ; SI-NEXT: v_mov_b32_e32 v52, v23 -; SI-NEXT: v_mov_b32_e32 v48, v60 +; SI-NEXT: v_mov_b32_e32 v48, v39 +; SI-NEXT: v_mov_b32_e32 v39, v60 ; SI-NEXT: v_mov_b32_e32 v60, v47 ; SI-NEXT: v_mov_b32_e32 v47, v42 ; SI-NEXT: v_mov_b32_e32 v42, v53 ; SI-NEXT: v_mov_b32_e32 v53, v22 -; SI-NEXT: v_mov_b32_e32 v35, v61 +; SI-NEXT: buffer_store_dword v34, 
off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v34, v61 ; SI-NEXT: v_mov_b32_e32 v61, v56 ; SI-NEXT: v_mov_b32_e32 v56, v43 ; SI-NEXT: v_mov_b32_e32 v43, v54 ; SI-NEXT: v_mov_b32_e32 v54, v24 -; SI-NEXT: v_mov_b32_e32 v50, v34 -; SI-NEXT: v_mov_b32_e32 v34, v62 +; SI-NEXT: v_mov_b32_e32 v50, v62 ; SI-NEXT: v_mov_b32_e32 v62, v57 ; SI-NEXT: v_mov_b32_e32 v57, v44 ; SI-NEXT: v_mov_b32_e32 v44, v55 ; SI-NEXT: v_mov_b32_e32 v55, v25 -; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: v_mov_b32_e32 v32, v51 +; SI-NEXT: v_mov_b32_e32 v51, v33 ; SI-NEXT: v_mov_b32_e32 v33, v63 ; SI-NEXT: v_mov_b32_e32 v63, v58 ; SI-NEXT: v_mov_b32_e32 v58, v45 ; SI-NEXT: v_mov_b32_e32 v45, v40 ; SI-NEXT: v_mov_b32_e32 v40, v31 -; SI-NEXT: v_mov_b32_e32 v39, v26 -; SI-NEXT: v_mov_b32_e32 v38, v27 -; SI-NEXT: v_mov_b32_e32 v37, v28 -; SI-NEXT: v_mov_b32_e32 v49, v36 -; SI-NEXT: v_mov_b32_e32 v36, v29 +; SI-NEXT: v_mov_b32_e32 v38, v26 +; SI-NEXT: v_mov_b32_e32 v37, v27 +; SI-NEXT: v_mov_b32_e32 v36, v28 +; SI-NEXT: v_mov_b32_e32 v35, v49 +; SI-NEXT: v_mov_b32_e32 v49, v29 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v29, v36 -; SI-NEXT: v_mov_b32_e32 v36, v49 -; SI-NEXT: v_mov_b32_e32 v28, v37 -; SI-NEXT: v_mov_b32_e32 v27, v38 -; SI-NEXT: v_mov_b32_e32 v26, v39 +; SI-NEXT: v_mov_b32_e32 v29, v49 +; SI-NEXT: v_mov_b32_e32 v49, v35 +; SI-NEXT: v_mov_b32_e32 v28, v36 +; SI-NEXT: v_mov_b32_e32 v27, v37 +; SI-NEXT: v_mov_b32_e32 v26, v38 ; SI-NEXT: v_mov_b32_e32 v31, v40 ; SI-NEXT: v_mov_b32_e32 v40, v45 ; SI-NEXT: v_mov_b32_e32 v45, v58 ; SI-NEXT: v_mov_b32_e32 v58, v63 ; SI-NEXT: v_mov_b32_e32 v63, v33 -; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v33, v51 +; SI-NEXT: v_mov_b32_e32 v51, v32 ; SI-NEXT: 
v_mov_b32_e32 v25, v55 ; SI-NEXT: v_mov_b32_e32 v55, v44 ; SI-NEXT: v_mov_b32_e32 v44, v57 ; SI-NEXT: v_mov_b32_e32 v57, v62 -; SI-NEXT: v_mov_b32_e32 v62, v34 -; SI-NEXT: v_mov_b32_e32 v34, v50 +; SI-NEXT: v_mov_b32_e32 v62, v50 ; SI-NEXT: v_mov_b32_e32 v24, v54 ; SI-NEXT: v_mov_b32_e32 v54, v43 ; SI-NEXT: v_mov_b32_e32 v43, v56 ; SI-NEXT: v_mov_b32_e32 v56, v61 -; SI-NEXT: v_mov_b32_e32 v61, v35 +; SI-NEXT: v_mov_b32_e32 v61, v34 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v22, v53 ; SI-NEXT: v_mov_b32_e32 v53, v42 ; SI-NEXT: v_mov_b32_e32 v42, v47 ; SI-NEXT: v_mov_b32_e32 v47, v60 -; SI-NEXT: v_mov_b32_e32 v60, v48 +; SI-NEXT: v_mov_b32_e32 v60, v39 +; SI-NEXT: v_mov_b32_e32 v39, v48 ; SI-NEXT: v_mov_b32_e32 v23, v52 ; SI-NEXT: v_mov_b32_e32 v52, v41 ; SI-NEXT: v_mov_b32_e32 v41, v46 ; SI-NEXT: v_mov_b32_e32 v46, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:88 ; 4-byte 
Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB47_2 ; @@ -31520,13 +31529,13 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v39, v1 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v60, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v9 @@ -31554,37 +31563,36 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 ; SI-NEXT: v_cvt_f16_f32_e32 v33, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v34, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s21 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v62, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v63, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v62, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v61, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v36, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v49, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v29, s27 ; SI-NEXT: v_cvt_f16_f32_e32 v28, s26 ; 
SI-NEXT: v_cvt_f16_f32_e32 v27, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v26, s28 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB55_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v62 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v61 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v29 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v48 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v35 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 @@ -31599,14 +31607,14 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 ; SI-NEXT: v_or_b32_e32 v0, v33, v0 -; SI-NEXT: v_or_b32_e32 v1, v34, v1 -; SI-NEXT: v_or_b32_e32 v3, v62, v3 -; SI-NEXT: v_or_b32_e32 v4, v36, v4 +; SI-NEXT: v_or_b32_e32 v2, v63, v2 +; SI-NEXT: v_or_b32_e32 v3, v34, v3 +; SI-NEXT: v_or_b32_e32 v4, v49, v4 ; SI-NEXT: v_or_b32_e32 v5, v28, v5 ; SI-NEXT: v_or_b32_e32 v6, v26, 
v6 -; SI-NEXT: v_or_b32_e32 v7, v48, v7 +; SI-NEXT: v_or_b32_e32 v7, v60, v7 ; SI-NEXT: v_or_b32_e32 v8, v38, v8 -; SI-NEXT: v_or_b32_e32 v9, v49, v9 +; SI-NEXT: v_or_b32_e32 v9, v36, v9 ; SI-NEXT: v_or_b32_e32 v10, v50, v10 ; SI-NEXT: v_or_b32_e32 v11, v59, v11 ; SI-NEXT: v_or_b32_e32 v12, v57, v12 @@ -31622,65 +31630,64 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; SI-NEXT: s_cbranch_execnz .LBB55_3 ; SI-NEXT: .LBB55_2: ; %cmp.true ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v62 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v26 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v48 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v60 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: 
v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v50 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v36 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v50 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v58 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v44 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v47 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v44 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v43 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v53 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v53 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v31 @@ -31694,25 +31701,27 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; 
SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v62 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -31732,12 +31741,12 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v39 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v48 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: 
v_lshlrev_b32_e32 v8, 16, v8 @@ -31825,86 +31834,89 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB55_4: ; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v59, v46 ; SI-NEXT: v_mov_b32_e32 v46, v41 ; SI-NEXT: v_mov_b32_e32 v41, v52 ; SI-NEXT: v_mov_b32_e32 v52, v23 -; SI-NEXT: v_mov_b32_e32 v48, v60 +; SI-NEXT: v_mov_b32_e32 v48, v39 +; SI-NEXT: v_mov_b32_e32 v39, v60 ; SI-NEXT: v_mov_b32_e32 v60, v47 ; SI-NEXT: v_mov_b32_e32 v47, v42 ; SI-NEXT: v_mov_b32_e32 v42, v53 ; SI-NEXT: v_mov_b32_e32 v53, v22 -; SI-NEXT: v_mov_b32_e32 v35, v61 +; 
SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v34, v61 ; SI-NEXT: v_mov_b32_e32 v61, v56 ; SI-NEXT: v_mov_b32_e32 v56, v43 ; SI-NEXT: v_mov_b32_e32 v43, v54 ; SI-NEXT: v_mov_b32_e32 v54, v24 -; SI-NEXT: v_mov_b32_e32 v50, v34 -; SI-NEXT: v_mov_b32_e32 v34, v62 +; SI-NEXT: v_mov_b32_e32 v50, v62 ; SI-NEXT: v_mov_b32_e32 v62, v57 ; SI-NEXT: v_mov_b32_e32 v57, v44 ; SI-NEXT: v_mov_b32_e32 v44, v55 ; SI-NEXT: v_mov_b32_e32 v55, v25 -; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: v_mov_b32_e32 v32, v51 +; SI-NEXT: v_mov_b32_e32 v51, v33 ; SI-NEXT: v_mov_b32_e32 v33, v63 ; SI-NEXT: v_mov_b32_e32 v63, v58 ; SI-NEXT: v_mov_b32_e32 v58, v45 ; SI-NEXT: v_mov_b32_e32 v45, v40 ; SI-NEXT: v_mov_b32_e32 v40, v31 -; SI-NEXT: v_mov_b32_e32 v39, v26 -; SI-NEXT: v_mov_b32_e32 v38, v27 -; SI-NEXT: v_mov_b32_e32 v37, v28 -; SI-NEXT: v_mov_b32_e32 v49, v36 -; SI-NEXT: v_mov_b32_e32 v36, v29 +; SI-NEXT: v_mov_b32_e32 v38, v26 +; SI-NEXT: v_mov_b32_e32 v37, v27 +; SI-NEXT: v_mov_b32_e32 v36, v28 +; SI-NEXT: v_mov_b32_e32 v35, v49 +; SI-NEXT: v_mov_b32_e32 v49, v29 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v29, v36 -; SI-NEXT: v_mov_b32_e32 v36, v49 -; SI-NEXT: v_mov_b32_e32 v28, v37 -; SI-NEXT: v_mov_b32_e32 v27, v38 -; SI-NEXT: v_mov_b32_e32 v26, v39 +; SI-NEXT: v_mov_b32_e32 v29, v49 +; SI-NEXT: v_mov_b32_e32 v49, v35 +; SI-NEXT: v_mov_b32_e32 v28, v36 +; SI-NEXT: v_mov_b32_e32 v27, v37 +; SI-NEXT: v_mov_b32_e32 v26, v38 ; SI-NEXT: v_mov_b32_e32 v31, v40 ; SI-NEXT: v_mov_b32_e32 v40, v45 ; SI-NEXT: v_mov_b32_e32 v45, v58 ; SI-NEXT: v_mov_b32_e32 v58, v63 ; SI-NEXT: v_mov_b32_e32 v63, v33 -; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v33, v51 +; SI-NEXT: 
v_mov_b32_e32 v51, v32 ; SI-NEXT: v_mov_b32_e32 v25, v55 ; SI-NEXT: v_mov_b32_e32 v55, v44 ; SI-NEXT: v_mov_b32_e32 v44, v57 ; SI-NEXT: v_mov_b32_e32 v57, v62 -; SI-NEXT: v_mov_b32_e32 v62, v34 -; SI-NEXT: v_mov_b32_e32 v34, v50 +; SI-NEXT: v_mov_b32_e32 v62, v50 ; SI-NEXT: v_mov_b32_e32 v24, v54 ; SI-NEXT: v_mov_b32_e32 v54, v43 ; SI-NEXT: v_mov_b32_e32 v43, v56 ; SI-NEXT: v_mov_b32_e32 v56, v61 -; SI-NEXT: v_mov_b32_e32 v61, v35 +; SI-NEXT: v_mov_b32_e32 v61, v34 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v22, v53 ; SI-NEXT: v_mov_b32_e32 v53, v42 ; SI-NEXT: v_mov_b32_e32 v42, v47 ; SI-NEXT: v_mov_b32_e32 v47, v60 -; SI-NEXT: v_mov_b32_e32 v60, v48 +; SI-NEXT: v_mov_b32_e32 v60, v39 +; SI-NEXT: v_mov_b32_e32 v39, v48 ; SI-NEXT: v_mov_b32_e32 v23, v52 ; SI-NEXT: v_mov_b32_e32 v52, v41 ; SI-NEXT: v_mov_b32_e32 v41, v46 ; SI-NEXT: v_mov_b32_e32 v46, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, 
s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB55_2 ; diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll index 152a48bec2636..d00cec9d58c61 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll @@ -5814,9 +5814,9 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v4 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v5 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s26, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16 @@ -5832,7 +5832,7 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s2, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s1, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45 @@ -5848,9 +5848,9 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43 -; 
GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB15_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false @@ -5867,9 +5867,9 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s16 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB15_3 ; GFX11-TRUE16-NEXT: .LBB15_2: ; %cmp.true ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51 @@ -5893,9 +5893,9 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s15, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0] ; 
GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] @@ -5924,9 +5924,9 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v4 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v5 -; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s26, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 @@ -5942,7 +5942,7 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s1, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 -; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45 @@ -5958,9 +5958,9 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 ; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB15_4 ; 
GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false @@ -5977,9 +5977,9 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 -; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 ; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB15_3 ; GFX11-FAKE16-NEXT: .LBB15_2: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51 @@ -6003,9 +6003,9 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s15, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] @@ -10053,9 +10053,9 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v4 ; GFX11-TRUE16-NEXT: v_and_b32_e32 
v38, 0xffff, v5 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s26, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16 @@ -10071,7 +10071,7 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i ; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s2, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s1, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45 @@ -10087,9 +10087,9 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB19_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false @@ -10106,9 +10106,9 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s0 :: 
v_dual_mov_b32 v15, s15 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s16 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB19_3 ; GFX11-TRUE16-NEXT: .LBB19_2: ; %cmp.true ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51 @@ -10132,9 +10132,9 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s15 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] @@ -10163,9 +10163,9 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i ; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v4 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v5 -; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s26, 16 ; 
GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 @@ -10181,7 +10181,7 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i ; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s1, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 -; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45 @@ -10197,9 +10197,9 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 ; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB19_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false @@ -10216,9 +10216,9 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 -; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, 
exec_lo, s15 ; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB19_3 ; GFX11-FAKE16-NEXT: .LBB19_2: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51 @@ -10242,9 +10242,9 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i ; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s15 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] @@ -15156,9 +15156,9 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v4 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v5 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s26, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16 @@ -15174,7 +15174,7 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s2, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s1, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 
+; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45 @@ -15190,9 +15190,9 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB31_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false @@ -15209,9 +15209,9 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s16 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB31_3 ; GFX11-TRUE16-NEXT: .LBB31_2: ; %cmp.true ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51 @@ -15235,9 +15235,9 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s14, 3 
op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s15, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] @@ -15266,9 +15266,9 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v4 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v5 -; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s26, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 @@ -15284,7 +15284,7 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s1, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 -; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45 @@ -15300,9 +15300,9 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 ; 
GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 ; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB31_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false @@ -15319,9 +15319,9 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 -; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 ; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB31_3 ; GFX11-FAKE16-NEXT: .LBB31_2: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51 @@ -15345,9 +15345,9 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s15, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0] +; 
GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] @@ -19385,9 +19385,9 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a, ; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v4 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v5 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s26, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16 @@ -19403,7 +19403,7 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a, ; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s2, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s1, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45 @@ -19419,9 +19419,9 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a, ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 ; GFX11-TRUE16-NEXT: s_and_b32 
s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB35_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false @@ -19438,9 +19438,9 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s16 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB35_3 ; GFX11-TRUE16-NEXT: .LBB35_2: ; %cmp.true ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51 @@ -19464,9 +19464,9 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a, ; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s15 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] @@ -19495,9 +19495,9 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a, ; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v4 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v5 -; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s26, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 @@ -19513,7 +19513,7 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a, ; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s1, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 -; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45 @@ -19529,9 +19529,9 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a, ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 ; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB35_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false @@ -19548,9 +19548,9 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a, ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, 
s12 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 -; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 ; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB35_3 ; GFX11-FAKE16-NEXT: .LBB35_2: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51 @@ -19574,9 +19574,9 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a, ; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s15 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] @@ -23757,9 +23757,9 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v4 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v5 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-TRUE16-NEXT: 
s_lshr_b32 s41, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s26, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16 @@ -23775,7 +23775,7 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s2, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s1, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45 @@ -23791,9 +23791,9 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB43_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false @@ -23810,9 +23810,9 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s16 +; 
GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB43_3 ; GFX11-TRUE16-NEXT: .LBB43_2: ; %cmp.true ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51 @@ -23836,9 +23836,9 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s15, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] @@ -23867,9 +23867,9 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v4 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v5 -; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s26, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 @@ -23885,7 +23885,7 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s1, 16 ; 
GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 -; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45 @@ -23901,9 +23901,9 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 ; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB43_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false @@ -23920,9 +23920,9 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 -; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 ; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB43_3 ; GFX11-FAKE16-NEXT: .LBB43_2: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51 @@ -23946,9 +23946,9 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; GFX11-FAKE16-NEXT: 
v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s15, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] @@ -28008,9 +28008,9 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v4 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v5 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s26, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16 @@ -28026,7 +28026,7 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i ; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s2, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s1, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45 @@ -28042,9 +28042,9 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i ; GFX11-TRUE16-NEXT: 
s_pack_ll_b32_b16 s13, s24, s13 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB47_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false @@ -28061,9 +28061,9 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s16 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB47_3 ; GFX11-TRUE16-NEXT: .LBB47_2: ; %cmp.true ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51 @@ -28087,9 +28087,9 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s15 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: 
v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] @@ -28118,9 +28118,9 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i ; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v4 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v5 -; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s26, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 @@ -28136,7 +28136,7 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i ; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s1, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 -; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45 @@ -28152,9 +28152,9 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 +; 
GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 ; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB47_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false @@ -28171,9 +28171,9 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 -; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 ; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB47_3 ; GFX11-FAKE16-NEXT: .LBB47_2: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51 @@ -28197,9 +28197,9 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i ; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s15 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 
op_sel_hi:[0,1] @@ -31479,9 +31479,9 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v4 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v5 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s26, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16 @@ -31497,7 +31497,7 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, ; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s2, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s1, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45 @@ -31513,9 +31513,9 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false @@ -31532,9 +31532,9 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, ; 
GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s16 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB51_3 ; GFX11-TRUE16-NEXT: .LBB51_2: ; %cmp.true ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51 @@ -31558,9 +31558,9 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s15, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] @@ -31589,9 +31589,9 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, ; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v4 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v5 -; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 
s15, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s26, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 @@ -31607,7 +31607,7 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, ; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s1, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 -; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45 @@ -31623,9 +31623,9 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 ; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB51_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false @@ -31642,9 +31642,9 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 -; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 ; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB51_3 ; GFX11-FAKE16-NEXT: .LBB51_2: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51 @@ -31668,9 +31668,9 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, ; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s15, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] @@ -35626,9 +35626,9 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a ; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v4 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v5 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s26, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16 @@ -35644,7 +35644,7 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a ; GFX11-TRUE16-NEXT: 
s_lshr_b32 s45, s2, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s1, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45 @@ -35660,9 +35660,9 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB55_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false @@ -35679,9 +35679,9 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s16 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB55_3 ; GFX11-TRUE16-NEXT: .LBB55_2: ; %cmp.true ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51 @@ -35705,9 +35705,9 @@ define inreg <12 x double> 
@bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a ; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s15 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] @@ -35736,9 +35736,9 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a ; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v4 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v5 -; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s26, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 @@ -35754,7 +35754,7 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a ; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s1, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 -; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45 @@ -35770,9 
+35770,9 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 ; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB55_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false @@ -35789,9 +35789,9 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 -; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 ; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB55_3 ; GFX11-FAKE16-NEXT: .LBB55_2: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51 @@ -35815,9 +35815,9 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a ; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s15 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, 
s16 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll index 97d040b545c09..87590a3ddfd06 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll @@ -6284,8 +6284,8 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v6 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v7 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 @@ -6302,7 +6302,7 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 @@ -6318,9 +6318,9 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, 
s25, s13 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s27, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB15_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false @@ -6339,9 +6339,9 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s18 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB15_3 ; GFX11-TRUE16-NEXT: .LBB15_2: ; %cmp.true ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55 @@ -6367,9 +6367,9 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s17, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s15, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0] +; 
GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] @@ -6404,8 +6404,8 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v5 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v6 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v7 -; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 @@ -6422,7 +6422,7 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 -; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 @@ -6438,9 +6438,9 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40 ; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB15_4 ; GFX11-FAKE16-NEXT: ; %bb.1: 
; %cmp.false @@ -6459,9 +6459,9 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16 -; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 ; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB15_3 ; GFX11-FAKE16-NEXT: .LBB15_2: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55 @@ -6487,9 +6487,9 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s17, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s15, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] @@ -10062,15 +10062,15 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; 
SI-NEXT: v_cvt_f16_f32_e32 v42, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v54, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 @@ -10088,13 +10088,13 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v27, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v53, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v43, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v12, s21 ; SI-NEXT: v_cvt_f16_f32_e32 v14, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 ; SI-NEXT: v_cvt_f16_f32_e32 v9, s24 @@ -10136,9 +10136,9 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte 
Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB19_4 @@ -10149,17 +10149,18 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 +; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 -; SI-NEXT: v_or_b32_e32 v0, v11, v0 ; SI-NEXT: v_or_b32_e32 v2, v14, v2 ; SI-NEXT: v_or_b32_e32 v3, v10, v3 ; SI-NEXT: v_or_b32_e32 v4, v9, v4 @@ -10167,11 +10168,12 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v46 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v55 ; SI-NEXT: 
v_lshlrev_b32_e32 v10, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v47 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v60 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 @@ -10183,10 +10185,10 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v51, v46 ; SI-NEXT: v_or_b32_e32 v7, v45, v7 ; SI-NEXT: v_or_b32_e32 v8, v40, v8 -; SI-NEXT: v_or_b32_e32 v9, v55, v9 +; SI-NEXT: v_or_b32_e32 v9, v42, v9 ; SI-NEXT: v_or_b32_e32 v10, v54, v10 -; SI-NEXT: v_or_b32_e32 v11, v47, v11 -; SI-NEXT: v_or_b32_e32 v12, v60, v12 +; SI-NEXT: v_or_b32_e32 v11, v57, v11 +; SI-NEXT: v_or_b32_e32 v12, v53, v12 ; SI-NEXT: v_or_b32_e32 v13, v52, v13 ; SI-NEXT: v_or_b32_e32 v14, v63, v14 ; SI-NEXT: v_or_b32_e32 v15, v61, v15 @@ -10214,14 +10216,16 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v25, v38, v25 ; SI-NEXT: s_cbranch_execnz .LBB19_3 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v40 -; 
SI-NEXT: v_cvt_f32_f16_e32 v10, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v42 +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -10233,8 +10237,8 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v63 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 @@ -10334,7 +10338,7 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -10372,7 +10376,7 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v55 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 @@ -10382,12 +10386,12 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v47 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: 
v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v60 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 @@ -10944,8 +10948,8 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v6 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v7 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 @@ -10962,7 +10966,7 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 @@ -10978,9 +10982,9 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s27, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; 
GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB19_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false @@ -10999,9 +11003,9 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s18 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB19_3 ; GFX11-TRUE16-NEXT: .LBB19_2: ; %cmp.true ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55 @@ -11027,9 +11031,9 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s17 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s15 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] @@ -11064,8 +11068,8 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; GFX11-FAKE16-NEXT: v_and_b32_e32 
v50, 0xffff, v5 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v6 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v7 -; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 @@ -11082,7 +11086,7 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 -; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 @@ -11098,9 +11102,9 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40 ; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB19_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false @@ -11119,9 +11123,9 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: 
v_dual_mov_b32 v15, s17 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16 -; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 ; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB19_3 ; GFX11-FAKE16-NEXT: .LBB19_2: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55 @@ -11147,9 +11151,9 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s17 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s15 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] @@ -16509,8 +16513,8 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v6 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v7 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 @@ -16527,7 
+16531,7 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 @@ -16543,9 +16547,9 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s27, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB31_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false @@ -16564,9 +16568,9 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s18 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB31_3 ; GFX11-TRUE16-NEXT: .LBB31_2: ; %cmp.true ; 
GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55 @@ -16592,9 +16596,9 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s17, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s15, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] @@ -16629,8 +16633,8 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v5 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v6 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v7 -; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 @@ -16647,7 +16651,7 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 -; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 @@ -16663,9 +16667,9 @@ define inreg <26 x float> 
@bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40 ; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB31_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false @@ -16684,9 +16688,9 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16 -; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 ; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB31_3 ; GFX11-FAKE16-NEXT: .LBB31_2: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55 @@ -16712,9 +16716,9 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s17, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s15, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, 
s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] @@ -17881,9 +17885,9 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: s_lshr_b32 s4, s10, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 ; SI-NEXT: s_lshr_b32 s4, s11, 16 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 ; SI-NEXT: s_lshr_b32 s4, s12, 16 +; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 ; SI-NEXT: s_lshr_b32 s4, s13, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 @@ -17927,8 +17931,8 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v32, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v34, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v36, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s11 ; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 ; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 @@ -17943,7 +17947,7 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v12, s24 ; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 ; SI-NEXT: v_cvt_f32_f16_e32 v11, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s21 ; SI-NEXT: v_cvt_f32_f16_e32 v52, s20 ; SI-NEXT: v_cvt_f32_f16_e32 v53, s19 ; SI-NEXT: v_cvt_f32_f16_e32 v55, s18 @@ -17952,7 +17956,16 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: s_cbranch_execnz .LBB33_3 ; SI-NEXT: .LBB33_2: ; %cmp.true ; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_add_f32_e64 v6, s6, 1.0 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e64 v3, s18, 1.0 ; SI-NEXT: v_add_f32_e64 v5, s19, 1.0 ; SI-NEXT: v_add_f32_e64 v7, s20, 1.0 @@ -17975,47 +17988,36 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: v_add_f32_e64 v12, s10, 1.0 ; SI-NEXT: v_add_f32_e64 v10, s8, 1.0 ; SI-NEXT: v_add_f32_e64 v8, s7, 1.0 -; SI-NEXT: v_add_f32_e64 v6, s6, 1.0 ; SI-NEXT: v_add_f32_e64 v29, s9, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v24 +; 
SI-NEXT: v_lshrrev_b32_e32 v26, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 ; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v23 ; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v21 -; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v19 -; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v17 -; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v15 -; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v13 -; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v12 -; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v10 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 @@ -18030,38 +18032,37 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v12, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v53, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v43, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 ; SI-NEXT: v_cvt_f32_f16_e32 
v14, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v57 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v56 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v44, v4 ; SI-NEXT: .LBB33_3: ; %end ; SI-NEXT: v_cvt_f16_f32_e32 v4, v44 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v43 @@ -18098,7 +18099,7 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: buffer_store_dword v4, v51, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v4, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v50 ; SI-NEXT: v_add_i32_e32 v49, vcc, 20, v0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v30, v4 @@ -18202,15 +18203,15 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; 
SI-NEXT: buffer_store_dword v4, v11, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v38 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x50, v0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v4, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v46 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 @@ -18272,7 +18273,7 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr39 @@ -18302,9 +18303,9 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr5 @@ -20283,15 +20284,15 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v7 ; 
SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v54, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 @@ -20309,13 +20310,13 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v27, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v53, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v43, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v12, s21 ; SI-NEXT: v_cvt_f16_f32_e32 v14, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 ; SI-NEXT: v_cvt_f16_f32_e32 v9, s24 @@ -20357,9 +20358,9 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB35_4 @@ -20370,17 +20371,18 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 +; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 -; SI-NEXT: v_or_b32_e32 v0, v11, v0 ; SI-NEXT: v_or_b32_e32 v2, v14, v2 ; SI-NEXT: v_or_b32_e32 v3, v10, v3 ; SI-NEXT: v_or_b32_e32 v4, v9, v4 @@ -20388,11 +20390,12 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v46 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v55 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v47 ; SI-NEXT: s_waitcnt expcnt(1) -; 
SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v60 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 @@ -20404,10 +20407,10 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: v_mov_b32_e32 v51, v46 ; SI-NEXT: v_or_b32_e32 v7, v45, v7 ; SI-NEXT: v_or_b32_e32 v8, v40, v8 -; SI-NEXT: v_or_b32_e32 v9, v55, v9 +; SI-NEXT: v_or_b32_e32 v9, v42, v9 ; SI-NEXT: v_or_b32_e32 v10, v54, v10 -; SI-NEXT: v_or_b32_e32 v11, v47, v11 -; SI-NEXT: v_or_b32_e32 v12, v60, v12 +; SI-NEXT: v_or_b32_e32 v11, v57, v11 +; SI-NEXT: v_or_b32_e32 v12, v53, v12 ; SI-NEXT: v_or_b32_e32 v13, v52, v13 ; SI-NEXT: v_or_b32_e32 v14, v63, v14 ; SI-NEXT: v_or_b32_e32 v15, v61, v15 @@ -20435,14 +20438,16 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 v25, v38, v25 ; SI-NEXT: s_cbranch_execnz .LBB35_3 ; SI-NEXT: .LBB35_2: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v42 +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, 
v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -20454,8 +20459,8 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v63 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 @@ -20555,7 +20560,7 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -20593,7 +20598,7 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v55 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 @@ -20603,12 +20608,12 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v47 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v60 ; 
SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 @@ -21165,8 +21170,8 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v6 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v7 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 @@ -21183,7 +21188,7 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 @@ -21199,9 +21204,9 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s27, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB35_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false @@ -21220,9 +21225,9 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg 
%a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s18 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB35_3 ; GFX11-TRUE16-NEXT: .LBB35_2: ; %cmp.true ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55 @@ -21248,9 +21253,9 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s17 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s15 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] @@ -21285,8 +21290,8 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v5 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v6 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v7 -; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 -; GFX11-FAKE16-NEXT: 
s_lshr_b32 s15, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 @@ -21303,7 +21308,7 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 -; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 @@ -21319,9 +21324,9 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40 ; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB35_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false @@ -21340,9 +21345,9 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16 -; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: 
v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 ; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB35_3 ; GFX11-FAKE16-NEXT: .LBB35_2: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55 @@ -21368,9 +21373,9 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s17 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s15 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] @@ -25935,8 +25940,8 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v6 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v7 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 @@ -25953,7 +25958,7 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 ; 
GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 @@ -25969,9 +25974,9 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s27, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB43_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false @@ -25990,9 +25995,9 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s18 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB43_3 ; GFX11-TRUE16-NEXT: .LBB43_2: ; %cmp.true ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55 @@ -26018,9 +26023,9 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: 
v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s17, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s15, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] @@ -26055,8 +26060,8 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v5 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v6 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v7 -; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 @@ -26073,7 +26078,7 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 -; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 @@ -26089,9 +26094,9 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 ; 
GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40 ; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB43_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false @@ -26110,9 +26115,9 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16 -; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 ; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB43_3 ; GFX11-FAKE16-NEXT: .LBB43_2: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55 @@ -26138,9 +26143,9 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s17, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s15, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0] +; 
GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] @@ -29728,15 +29733,15 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v54, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 @@ -29754,13 +29759,13 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v27, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v53, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v43, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v12, s21 ; SI-NEXT: v_cvt_f16_f32_e32 v14, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 ; SI-NEXT: v_cvt_f16_f32_e32 v9, s24 @@ 
-29802,9 +29807,9 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB47_4 @@ -29815,17 +29820,18 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 +; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 
16, v6 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 -; SI-NEXT: v_or_b32_e32 v0, v11, v0 ; SI-NEXT: v_or_b32_e32 v2, v14, v2 ; SI-NEXT: v_or_b32_e32 v3, v10, v3 ; SI-NEXT: v_or_b32_e32 v4, v9, v4 @@ -29833,11 +29839,12 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v46 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v55 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v47 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v60 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 @@ -29849,10 +29856,10 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v51, v46 ; SI-NEXT: v_or_b32_e32 v7, v45, v7 ; SI-NEXT: v_or_b32_e32 v8, v40, v8 -; SI-NEXT: v_or_b32_e32 v9, v55, v9 +; SI-NEXT: v_or_b32_e32 v9, v42, v9 ; SI-NEXT: v_or_b32_e32 v10, v54, v10 -; SI-NEXT: v_or_b32_e32 v11, v47, v11 -; SI-NEXT: v_or_b32_e32 v12, v60, v12 +; SI-NEXT: v_or_b32_e32 v11, v57, v11 +; SI-NEXT: v_or_b32_e32 v12, v53, v12 ; SI-NEXT: v_or_b32_e32 v13, v52, v13 ; SI-NEXT: v_or_b32_e32 v14, v63, v14 ; SI-NEXT: v_or_b32_e32 v15, v61, v15 @@ -29880,14 +29887,16 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v25, v38, v25 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; 
SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v42 +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -29899,8 +29908,8 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v63 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 @@ -30000,7 +30009,7 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -30038,7 +30047,7 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v42 +; 
SI-NEXT: v_cvt_f32_f16_e32 v9, v55 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 @@ -30048,12 +30057,12 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v47 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v60 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 @@ -30610,8 +30619,8 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v6 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v7 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 @@ -30628,7 +30637,7 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 @@ -30644,9 +30653,9 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; GFX11-TRUE16-NEXT: 
s_pack_ll_b32_b16 s12, s24, s12 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s27, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB47_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false @@ -30665,9 +30674,9 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s18 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB47_3 ; GFX11-TRUE16-NEXT: .LBB47_2: ; %cmp.true ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55 @@ -30693,9 +30702,9 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s17 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s15 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: 
v_pk_add_f16 v15, 0x200, s18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] @@ -30730,8 +30739,8 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v5 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v6 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v7 -; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 @@ -30748,7 +30757,7 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 -; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 @@ -30764,9 +30773,9 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 
s17, s29, s40 ; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB47_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false @@ -30785,9 +30794,9 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16 -; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 ; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB47_3 ; GFX11-FAKE16-NEXT: .LBB47_2: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55 @@ -30813,9 +30822,9 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s17 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s15 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] @@ -34455,8 +34464,8 @@ define inreg <13 x double> 
@bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v6 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v7 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 @@ -34473,7 +34482,7 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 @@ -34489,9 +34498,9 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s27, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false @@ -34510,9 +34519,9 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 ; GFX11-TRUE16-NEXT: 
v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s18 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB51_3 ; GFX11-TRUE16-NEXT: .LBB51_2: ; %cmp.true ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55 @@ -34538,9 +34547,9 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s17, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s15, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] @@ -34575,8 +34584,8 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v5 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v6 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v7 -; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 ; 
GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 @@ -34593,7 +34602,7 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 -; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 @@ -34609,9 +34618,9 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40 ; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB51_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false @@ -34630,9 +34639,9 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16 -; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 ; GFX11-FAKE16-NEXT: 
s_cbranch_vccnz .LBB51_3 ; GFX11-FAKE16-NEXT: .LBB51_2: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55 @@ -34658,9 +34667,9 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s17, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s15, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] @@ -35816,12 +35825,12 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v42, s40 ; SI-NEXT: s_lshr_b32 s40, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v44, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v12, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v15, s11 ; SI-NEXT: v_cvt_f32_f16_e32 v17, s10 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s13 @@ -35847,22 +35856,24 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: v_add_f64 v[1:2], s[16:17], 1.0 ; SI-NEXT: v_add_f64 v[54:55], s[18:19], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_add_f64 v[49:50], s[20:21], 1.0 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte 
Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f64 v[3:4], s[4:5], 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v2 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v55 ; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v53, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v42 -; SI-NEXT: v_add_f64 v[49:50], s[20:21], 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v44 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_add_f64 v[37:38], s[22:23], 1.0 ; SI-NEXT: v_add_f64 v[33:34], s[24:25], 1.0 ; SI-NEXT: v_add_f64 v[31:32], s[26:27], 1.0 @@ -35872,8 +35883,8 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: v_add_f64 v[14:15], s[10:11], 1.0 ; SI-NEXT: v_add_f64 v[11:12], s[8:9], 1.0 ; SI-NEXT: v_add_f64 v[7:8], s[6:7], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v50 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f64 v[3:4], s[4:5], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v37 ; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v38 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v33 @@ -35892,13 +35903,14 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v12 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v7 ; SI-NEXT: 
v_lshrrev_b32_e32 v6, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v4 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 @@ -35914,9 +35926,10 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v43, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 @@ -35935,191 +35948,187 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v51 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v51, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v44 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v44, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 ; SI-NEXT: .LBB53_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v10, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 +; SI-NEXT: 
v_cvt_f16_f32_e32 v43, v43 ; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 ; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v42 -; SI-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_or_b32_e32 v43, v43, v44 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: buffer_store_dword v43, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v10, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v13, v41, v13 -; SI-NEXT: buffer_store_dword v13, v10, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v10, v40 +; SI-NEXT: v_add_i32_e32 v43, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v41, v41, v42 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: buffer_store_dword v41, v43, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v13, v55 -; SI-NEXT: v_add_i32_e32 v55, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 -; SI-NEXT: buffer_store_dword v10, v55, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v41, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v55, v55, v40 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: buffer_store_dword v55, v41, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v53 -; SI-NEXT: v_add_i32_e32 v53, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 -; SI-NEXT: buffer_store_dword v10, v53, s[0:3], 0 
offen +; SI-NEXT: v_add_i32_e32 v55, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v53, v53, v54 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: buffer_store_dword v53, v55, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v52 -; SI-NEXT: v_add_i32_e32 v51, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 -; SI-NEXT: buffer_store_dword v10, v51, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v53, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v51, v52, v51 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: buffer_store_dword v51, v53, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v50 -; SI-NEXT: v_add_i32_e32 v49, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 -; SI-NEXT: buffer_store_dword v10, v49, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v51, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v49, v50, v49 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: buffer_store_dword v49, v51, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v48 -; SI-NEXT: v_add_i32_e32 v39, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 -; SI-NEXT: buffer_store_dword v10, v39, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v49, vcc, 24, v0 +; SI-NEXT: v_or_b32_e32 v39, v48, v39 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: buffer_store_dword v39, v49, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v37 -; 
SI-NEXT: v_cvt_f16_f32_e32 v13, v38 -; SI-NEXT: v_add_i32_e32 v37, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 -; SI-NEXT: buffer_store_dword v10, v37, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v39, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v37, v38, v37 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v36 -; SI-NEXT: v_add_i32_e32 v35, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 -; SI-NEXT: buffer_store_dword v10, v35, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v37, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v35, v36, v35 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v34 -; SI-NEXT: v_add_i32_e32 v33, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 -; SI-NEXT: buffer_store_dword v10, v33, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v35, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v33, v34, v33 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v32 -; SI-NEXT: v_add_i32_e32 v31, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 -; SI-NEXT: buffer_store_dword v10, v31, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v33, vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v30 -; SI-NEXT: v_add_i32_e32 v28, 
vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 -; SI-NEXT: buffer_store_dword v10, v28, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v31, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v28, v30, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: buffer_store_dword v28, v31, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v29 -; SI-NEXT: v_add_i32_e32 v26, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 -; SI-NEXT: buffer_store_dword v10, v26, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v28, v29 +; SI-NEXT: v_add_i32_e32 v29, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v27 -; SI-NEXT: v_add_i32_e32 v24, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 -; SI-NEXT: buffer_store_dword v10, v24, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v26, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v25 -; SI-NEXT: v_add_i32_e32 v22, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 -; SI-NEXT: buffer_store_dword v10, v22, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v24, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen ; SI-NEXT: 
s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v23 -; SI-NEXT: v_add_i32_e32 v20, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 -; SI-NEXT: buffer_store_dword v10, v20, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v22, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v21 -; SI-NEXT: v_add_i32_e32 v18, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 -; SI-NEXT: buffer_store_dword v10, v18, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v20, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v19 -; SI-NEXT: v_add_i32_e32 v16, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 -; SI-NEXT: buffer_store_dword v10, v16, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v17 -; SI-NEXT: v_add_i32_e32 v14, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 -; SI-NEXT: buffer_store_dword v10, v14, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 +; SI-NEXT: 
v_add_i32_e32 v17, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v15 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v14, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v11, v15, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v45 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x50, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x50, v0 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: buffer_store_dword v9, v11, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v9, v13, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v9, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x54, v0 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x54, v0 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: buffer_store_dword v7, v11, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x58, v0 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v7, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 
v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x5c, v0 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x5c, v0 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x60, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x60, v0 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v4 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -36183,17 +36192,17 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: 
$vgpr1 ; SI-NEXT: s_branch .LBB53_2 ; @@ -38138,15 +38147,15 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v54, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 @@ -38164,13 +38173,13 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v27, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v53, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v43, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v12, s21 ; SI-NEXT: v_cvt_f16_f32_e32 v14, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 ; SI-NEXT: v_cvt_f16_f32_e32 v9, s24 @@ -38212,9 +38221,9 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: 
buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB55_4 @@ -38225,17 +38234,18 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 +; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 -; SI-NEXT: v_or_b32_e32 v0, v11, v0 ; SI-NEXT: v_or_b32_e32 v2, v14, v2 ; SI-NEXT: v_or_b32_e32 v3, v10, v3 ; SI-NEXT: v_or_b32_e32 v4, v9, v4 @@ 
-38243,11 +38253,12 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v46 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v55 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v47 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v60 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 @@ -38259,10 +38270,10 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: v_mov_b32_e32 v51, v46 ; SI-NEXT: v_or_b32_e32 v7, v45, v7 ; SI-NEXT: v_or_b32_e32 v8, v40, v8 -; SI-NEXT: v_or_b32_e32 v9, v55, v9 +; SI-NEXT: v_or_b32_e32 v9, v42, v9 ; SI-NEXT: v_or_b32_e32 v10, v54, v10 -; SI-NEXT: v_or_b32_e32 v11, v47, v11 -; SI-NEXT: v_or_b32_e32 v12, v60, v12 +; SI-NEXT: v_or_b32_e32 v11, v57, v11 +; SI-NEXT: v_or_b32_e32 v12, v53, v12 ; SI-NEXT: v_or_b32_e32 v13, v52, v13 ; SI-NEXT: v_or_b32_e32 v14, v63, v14 ; SI-NEXT: v_or_b32_e32 v15, v61, v15 @@ -38290,14 +38301,16 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: v_or_b32_e32 v25, v38, v25 ; SI-NEXT: s_cbranch_execnz .LBB55_3 ; SI-NEXT: .LBB55_2: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; 
SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v42 +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -38309,8 +38322,8 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v63 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 @@ -38410,7 +38423,7 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -38448,7 +38461,7 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v55 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 @@ -38458,12 +38471,12 @@ define inreg <13 x double> 
@bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v47 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v60 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 @@ -39020,8 +39033,8 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v6 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v7 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 @@ -39038,7 +39051,7 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 @@ -39054,9 +39067,9 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42 -; 
GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s27, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB55_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false @@ -39075,9 +39088,9 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s18 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB55_3 ; GFX11-TRUE16-NEXT: .LBB55_2: ; %cmp.true ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55 @@ -39103,9 +39116,9 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s17 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s15 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: 
v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] @@ -39140,8 +39153,8 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v5 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v6 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v7 -; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 @@ -39158,7 +39171,7 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 -; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 @@ -39174,9 +39187,9 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40 ; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB55_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false @@ -39195,9 +39208,9 @@ define inreg <13 x 
double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16 -; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 ; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB55_3 ; GFX11-FAKE16-NEXT: .LBB55_2: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55 @@ -39223,9 +39236,9 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s17 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s15 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] @@ -40593,12 +40606,12 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 -; SI-NEXT: 
buffer_load_dword v44, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill @@ -40630,70 +40643,75 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB57_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_cvt_f32_f16_e32 v31, s22 +; SI-NEXT: v_mov_b32_e32 v46, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v44 +; SI-NEXT: v_mov_b32_e32 v44, v58 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s24 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v33, s16 ; SI-NEXT: v_cvt_f32_f16_e32 v49, 
s17 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s26 ; SI-NEXT: v_cvt_f32_f16_e32 v34, s18 ; SI-NEXT: v_cvt_f32_f16_e32 v50, s19 ; SI-NEXT: v_cvt_f32_f16_e32 v60, s20 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s28 ; SI-NEXT: v_cvt_f32_f16_e32 v35, s21 ; SI-NEXT: v_cvt_f32_f16_e32 v63, s23 ; SI-NEXT: v_cvt_f32_f16_e32 v61, s25 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s29 ; SI-NEXT: v_cvt_f32_f16_e32 v59, s27 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: v_cvt_f32_f16_e32 v42, v19 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v53, v25 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v40, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v28 -; SI-NEXT: buffer_store_dword 
v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v57 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v44 -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v57, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_mov_b32_e32 v56, v32 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_mov_b32_e32 v47, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 +; SI-NEXT: v_mov_b32_e32 v45, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v7 @@ -40746,6 +40764,24 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i ; SI-NEXT: .LBB57_2: ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: 
v_mov_b32_e32 v57, v48 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: v_mov_b32_e32 v56, v32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: v_mov_b32_e32 v47, v38 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: v_mov_b32_e32 v46, v44 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: v_mov_b32_e32 v45, v36 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: v_mov_b32_e32 v44, v58 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 @@ -40812,24 +40848,12 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: .LBB57_3: ; %Flow ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v58, v62 ; SI-NEXT: v_mov_b32_e32 v62, v32 ; SI-NEXT: v_mov_b32_e32 v32, v37 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v37, v39 ; SI-NEXT: v_mov_b32_e32 v39, v51 ; SI-NEXT: v_mov_b32_e32 v51, v53 @@ -40840,6 +40864,12 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i ; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v43, vcc, 
3, v44 +; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v46 +; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v56 +; SI-NEXT: v_add_i32_e32 v56, vcc, 3, v57 ; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: v_cvt_f32_f16_e32 v31, s22 ; SI-NEXT: s_add_i32 s24, s24, 3 @@ -40849,29 +40879,23 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, s24 ; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v44 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, s26 -; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v45 -; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v46 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, s28 -; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v47 -; SI-NEXT: v_add_i32_e32 v56, vcc, 3, v56 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, s29 -; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v57 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s28 ; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s29 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_cvt_f32_f16_e32 v33, s16 ; SI-NEXT: 
v_cvt_f32_f16_e32 v49, s17 @@ -40883,7 +40907,6 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v61, s25 ; SI-NEXT: v_cvt_f32_f16_e32 v59, s27 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v32, v57 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v56 ; SI-NEXT: v_cvt_f32_f16_e32 v62, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v46 @@ -40892,8 +40915,9 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v42 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v32, v57 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload @@ -43287,7 +43311,7 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v5, v56 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v44, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v45, s17 ; SI-NEXT: v_cvt_f16_f32_e32 v43, s21 ; SI-NEXT: v_cvt_f16_f32_e32 v42, s25 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -43297,25 +43321,26 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB59_3 ; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v45 ; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 ; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 ; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v43 -; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 +; SI-NEXT: 
v_cvt_f16_f32_e32 v44, v44 ; SI-NEXT: v_cvt_f16_f32_e32 v45, v43 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v44 +; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 ; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v30 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 ; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v42 -; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v44 +; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v45 ; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 ; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 ; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v42, v55 @@ -43487,13 +43512,12 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v22 ; SI-NEXT: v_or_b32_e32 v25, v25, v29 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v45 ; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v42 ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v40 ; SI-NEXT: v_or_b32_e32 v24, v24, v29 ; SI-NEXT: v_or_b32_e32 v27, v27, v43 -; SI-NEXT: v_or_b32_e32 v26, v26, v45 +; SI-NEXT: v_or_b32_e32 v26, v26, v44 ; SI-NEXT: v_or_b32_e32 v21, v21, v30 ; SI-NEXT: v_or_b32_e32 v20, v20, v41 ; SI-NEXT: v_or_b32_e32 v49, v49, v46 @@ -43505,8 +43529,8 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v11, v11, v56 ; SI-NEXT: v_or_b32_e32 v6, v6, v28 ; SI-NEXT: v_or_b32_e32 v4, v4, v57 -; SI-NEXT: v_alignbit_b32 v44, v24, v43, 16 -; SI-NEXT: v_alignbit_b32 v43, v25, v45, 16 +; SI-NEXT: v_alignbit_b32 v45, v24, v43, 16 +; SI-NEXT: v_alignbit_b32 v43, v25, 
v44, 16 ; SI-NEXT: v_alignbit_b32 v42, v19, v30, 16 ; SI-NEXT: v_alignbit_b32 v30, v50, v41, 16 ; SI-NEXT: v_alignbit_b32 v41, v48, v46, 16 @@ -43520,7 +43544,7 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_alignbit_b32 v28, v5, v57, 16 ; SI-NEXT: .LBB59_3: ; %end ; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v45 ; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v27, v27, v44 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll index a43ce77b20631..c7a5a49df1a6e 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll @@ -5451,8 +5451,8 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v46 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v47 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v57 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v57 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill @@ -5479,8 +5479,8 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v38 ; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v37 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 -; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 +; GFX9-NEXT: v_perm_b32 v1, v62, v58, s6 +; GFX9-NEXT: v_perm_b32 v2, v35, v57, s6 ; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 ; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 ; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 @@ -5530,8 +5530,8 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> 
%a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr61 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr62 ; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr62 ; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 @@ -5621,8 +5621,8 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 -; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 +; GFX9-NEXT: v_perm_b32 v1, v62, v58, s6 +; GFX9-NEXT: v_perm_b32 v2, v35, v57, s6 ; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 ; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 ; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 @@ -10909,15 +10909,15 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v14 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 @@ -11010,9 +11010,9 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v23, off, s[0:3], 
s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v49, v2 +; SI-NEXT: v_mov_b32_e32 v63, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_mov_b32_e32 v48, v3 +; SI-NEXT: v_mov_b32_e32 v49, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -11021,9 +11021,9 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v61, v44 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_mov_b32_e32 v39, v11 +; SI-NEXT: v_mov_b32_e32 v48, v11 ; SI-NEXT: v_or_b32_e32 v2, v11, v2 -; SI-NEXT: v_mov_b32_e32 v33, v10 +; SI-NEXT: v_mov_b32_e32 v35, v10 ; SI-NEXT: v_or_b32_e32 v3, v10, v3 ; SI-NEXT: v_or_b32_e32 v4, v9, v4 ; SI-NEXT: v_or_b32_e32 v5, v8, v5 @@ -11033,7 +11033,7 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v60 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v62 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v39 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 @@ -11048,9 +11048,9 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v40, v56 ; SI-NEXT: v_or_b32_e32 v10, v56, v10 ; SI-NEXT: v_or_b32_e32 v11, v45, v11 -; SI-NEXT: v_or_b32_e32 v12, v38, v12 -; SI-NEXT: v_or_b32_e32 v13, v36, v13 -; SI-NEXT: v_or_b32_e32 v14, v35, v14 +; SI-NEXT: v_or_b32_e32 v12, v33, v12 +; SI-NEXT: v_or_b32_e32 v13, v38, v13 +; SI-NEXT: v_or_b32_e32 v14, v36, v14 ; SI-NEXT: v_or_b32_e32 v15, v32, v15 ; SI-NEXT: v_or_b32_e32 v17, v37, v17 ; SI-NEXT: buffer_load_dword v26, 
off, s[0:3], s32 offset:128 ; 4-byte Folded Reload @@ -11112,10 +11112,10 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -11125,7 +11125,7 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v49 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 @@ -11143,9 +11143,9 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v32 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 @@ -11258,7 +11258,7 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v39 ; SI-NEXT: 
v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 @@ -11398,10 +11398,10 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB19_4: -; SI-NEXT: v_mov_b32_e32 v39, v11 -; SI-NEXT: v_mov_b32_e32 v33, v10 -; SI-NEXT: v_mov_b32_e32 v49, v2 -; SI-NEXT: v_mov_b32_e32 v48, v3 +; SI-NEXT: v_mov_b32_e32 v48, v11 +; SI-NEXT: v_mov_b32_e32 v35, v10 +; SI-NEXT: v_mov_b32_e32 v63, v2 +; SI-NEXT: v_mov_b32_e32 v49, v3 ; SI-NEXT: v_mov_b32_e32 v52, v37 ; SI-NEXT: v_mov_b32_e32 v37, v29 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 @@ -16566,8 +16566,8 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v46 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v47 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v57 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v57 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill @@ -16594,8 +16594,8 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v38 ; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v37 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 -; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 +; GFX9-NEXT: v_perm_b32 v1, v62, v58, s6 +; GFX9-NEXT: v_perm_b32 v2, v35, v57, s6 ; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 ; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 ; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 @@ 
-16645,8 +16645,8 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr61 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr62 ; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr62 ; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 @@ -16736,8 +16736,8 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 -; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 +; GFX9-NEXT: v_perm_b32 v1, v62, v58, s6 +; GFX9-NEXT: v_perm_b32 v2, v35, v57, s6 ; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 ; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 ; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 @@ -19380,13 +19380,13 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: s_lshr_b32 s4, s9, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 ; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 ; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 ; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 ; SI-NEXT: s_lshr_b32 s4, s11, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 ; SI-NEXT: s_lshr_b32 s4, s12, 16 @@ -19461,83 +19461,98 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v11, s22 ; SI-NEXT: v_cvt_f32_f16_e32 v9, s21 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 -; SI-NEXT: 
v_cvt_f32_f16_e32 v2, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s16 ; SI-NEXT: s_cbranch_execnz .LBB33_3 ; SI-NEXT: .LBB33_2: ; %cmp.true ; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e64 v3, s18, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; SI-NEXT: v_add_f32_e64 v14, s11, 1.0 ; SI-NEXT: v_add_f32_e64 v36, s6, 1.0 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e64 v10, s8, 1.0 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e64 v9, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v15, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v22, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v29, s43, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 ; SI-NEXT: v_add_f32_e64 v26, s29, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v29 +; SI-NEXT: v_add_f32_e64 v10, s8, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v26 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v26 -; SI-NEXT: 
s_waitcnt expcnt(6) ; SI-NEXT: v_cvt_f32_f16_e32 v58, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v44 +; SI-NEXT: v_add_f32_e64 v48, s9, 1.0 ; SI-NEXT: v_add_f32_e64 v12, s10, 1.0 -; SI-NEXT: v_add_f32_e64 v33, s7, 1.0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v3, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v5, s19, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v50 ; SI-NEXT: v_add_f32_e64 v7, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v9, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v11, s22, 1.0 ; SI-NEXT: v_add_f32_e64 v13, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v15, s24, 1.0 -; SI-NEXT: v_add_f32_e64 v18, s25, 1.0 ; SI-NEXT: v_add_f32_e64 v20, s26, 1.0 -; SI-NEXT: v_add_f32_e64 v22, s27, 1.0 -; SI-NEXT: v_add_f32_e64 v24, s28, 1.0 -; SI-NEXT: v_add_f32_e64 v29, s43, 1.0 -; SI-NEXT: v_add_f32_e64 v27, s42, 1.0 ; SI-NEXT: v_add_f32_e64 v25, s41, 1.0 ; SI-NEXT: v_add_f32_e64 v23, s40, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v40 +; SI-NEXT: v_mov_b32_e32 v40, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v46 +; SI-NEXT: v_add_f32_e64 v5, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v11, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v18, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v24, s28, 1.0 +; 
SI-NEXT: v_add_f32_e64 v27, s42, 1.0 ; SI-NEXT: v_add_f32_e64 v21, s15, 1.0 ; SI-NEXT: v_add_f32_e64 v19, s14, 1.0 ; SI-NEXT: v_add_f32_e64 v17, s13, 1.0 ; SI-NEXT: v_add_f32_e64 v16, s12, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v33 -; SI-NEXT: v_add_f32_e64 v48, s9, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v23 +; SI-NEXT: v_add_f32_e64 v33, s7, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v27 ; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v21 ; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v19 ; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v17 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 @@ -19548,50 +19563,40 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: 
v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v44, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v62 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v45 -; SI-NEXT: v_mov_b32_e32 v45, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v42 +; SI-NEXT: 
v_cvt_f32_f16_e32 v42, v56 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v46, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v56, v6 -; SI-NEXT: v_mov_b32_e32 v47, v8 -; SI-NEXT: v_mov_b32_e32 v43, v34 ; SI-NEXT: .LBB33_3: ; %end ; SI-NEXT: v_cvt_f16_f32_e32 v6, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v47 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v45 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v31 @@ -19602,14 +19607,14 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: buffer_store_dword v8, v6, s[0:3], 0 offen ; SI-NEXT: v_cvt_f16_f32_e32 v6, v44 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v43 ; SI-NEXT: v_add_i32_e32 v31, vcc, 8, v0 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 ; SI-NEXT: buffer_store_dword v6, v31, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v6, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v41 ; SI-NEXT: v_add_i32_e32 v31, vcc, 12, v0 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 @@ -19748,26 +19753,26 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: buffer_store_dword v4, v8, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v59 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x5c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, 
v45 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v58 ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v4, v57 ; SI-NEXT: v_add_i32_e32 v5, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 @@ -19801,13 +19806,13 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr55 @@ -19848,13 +19853,13 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: 
$vgpr2 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: s_branch .LBB33_2 @@ -22009,15 +22014,15 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v14 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 @@ -22110,9 +22115,9 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v49, v2 +; SI-NEXT: v_mov_b32_e32 v63, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_mov_b32_e32 v48, v3 +; SI-NEXT: v_mov_b32_e32 v49, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -22121,9 +22126,9 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: v_mov_b32_e32 v61, v44 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_mov_b32_e32 v39, v11 +; SI-NEXT: v_mov_b32_e32 v48, v11 ; 
SI-NEXT: v_or_b32_e32 v2, v11, v2 -; SI-NEXT: v_mov_b32_e32 v33, v10 +; SI-NEXT: v_mov_b32_e32 v35, v10 ; SI-NEXT: v_or_b32_e32 v3, v10, v3 ; SI-NEXT: v_or_b32_e32 v4, v9, v4 ; SI-NEXT: v_or_b32_e32 v5, v8, v5 @@ -22133,7 +22138,7 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v60 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v62 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v39 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 @@ -22148,9 +22153,9 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: v_mov_b32_e32 v40, v56 ; SI-NEXT: v_or_b32_e32 v10, v56, v10 ; SI-NEXT: v_or_b32_e32 v11, v45, v11 -; SI-NEXT: v_or_b32_e32 v12, v38, v12 -; SI-NEXT: v_or_b32_e32 v13, v36, v13 -; SI-NEXT: v_or_b32_e32 v14, v35, v14 +; SI-NEXT: v_or_b32_e32 v12, v33, v12 +; SI-NEXT: v_or_b32_e32 v13, v38, v13 +; SI-NEXT: v_or_b32_e32 v14, v36, v14 ; SI-NEXT: v_or_b32_e32 v15, v32, v15 ; SI-NEXT: v_or_b32_e32 v17, v37, v17 ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload @@ -22212,10 +22217,10 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -22225,7 +22230,7 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v49 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 @@ -22243,9 +22248,9 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v32 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 @@ -22358,7 +22363,7 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v39 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 @@ -22498,10 +22503,10 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB35_4: -; SI-NEXT: v_mov_b32_e32 v39, v11 -; SI-NEXT: v_mov_b32_e32 v33, v10 -; SI-NEXT: v_mov_b32_e32 v49, v2 -; SI-NEXT: v_mov_b32_e32 v48, v3 +; SI-NEXT: v_mov_b32_e32 v48, v11 +; SI-NEXT: v_mov_b32_e32 v35, v10 +; SI-NEXT: v_mov_b32_e32 v63, v2 +; SI-NEXT: v_mov_b32_e32 v49, v3 ; SI-NEXT: v_mov_b32_e32 v52, v37 ; SI-NEXT: v_mov_b32_e32 v37, v29 ; SI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 @@ -26822,8 +26827,8 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v46 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v47 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v57 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v57 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill @@ -26850,8 +26855,8 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v38 ; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v37 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 -; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 +; GFX9-NEXT: v_perm_b32 v1, v62, v58, s6 +; GFX9-NEXT: v_perm_b32 v2, v35, v57, s6 ; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 ; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 ; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 @@ -26901,8 +26906,8 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr61 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr62 ; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr62 ; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 @@ -26992,8 +26997,8 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 -; 
GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 +; GFX9-NEXT: v_perm_b32 v1, v62, v58, s6 +; GFX9-NEXT: v_perm_b32 v2, v35, v57, s6 ; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 ; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 ; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 @@ -32294,15 +32299,15 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v14 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 @@ -32395,9 +32400,9 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v49, v2 +; SI-NEXT: v_mov_b32_e32 v63, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_mov_b32_e32 v48, v3 +; SI-NEXT: v_mov_b32_e32 v49, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -32406,9 +32411,9 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; 
SI-NEXT: v_mov_b32_e32 v61, v44 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_mov_b32_e32 v39, v11 +; SI-NEXT: v_mov_b32_e32 v48, v11 ; SI-NEXT: v_or_b32_e32 v2, v11, v2 -; SI-NEXT: v_mov_b32_e32 v33, v10 +; SI-NEXT: v_mov_b32_e32 v35, v10 ; SI-NEXT: v_or_b32_e32 v3, v10, v3 ; SI-NEXT: v_or_b32_e32 v4, v9, v4 ; SI-NEXT: v_or_b32_e32 v5, v8, v5 @@ -32418,7 +32423,7 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v60 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v62 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v39 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 @@ -32433,9 +32438,9 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v40, v56 ; SI-NEXT: v_or_b32_e32 v10, v56, v10 ; SI-NEXT: v_or_b32_e32 v11, v45, v11 -; SI-NEXT: v_or_b32_e32 v12, v38, v12 -; SI-NEXT: v_or_b32_e32 v13, v36, v13 -; SI-NEXT: v_or_b32_e32 v14, v35, v14 +; SI-NEXT: v_or_b32_e32 v12, v33, v12 +; SI-NEXT: v_or_b32_e32 v13, v38, v13 +; SI-NEXT: v_or_b32_e32 v14, v36, v14 ; SI-NEXT: v_or_b32_e32 v15, v32, v15 ; SI-NEXT: v_or_b32_e32 v17, v37, v17 ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload @@ -32497,10 +32502,10 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 ; SI-NEXT: v_add_f32_e32 v2, 
0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -32510,7 +32515,7 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v49 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 @@ -32528,9 +32533,9 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v32 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 @@ -32643,7 +32648,7 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v39 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 @@ -32783,10 +32788,10 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: -; SI-NEXT: v_mov_b32_e32 v39, v11 -; SI-NEXT: v_mov_b32_e32 v33, v10 -; SI-NEXT: v_mov_b32_e32 v49, v2 -; SI-NEXT: v_mov_b32_e32 v48, v3 +; SI-NEXT: v_mov_b32_e32 v48, v11 +; SI-NEXT: v_mov_b32_e32 v35, v10 +; SI-NEXT: v_mov_b32_e32 v63, v2 +; SI-NEXT: v_mov_b32_e32 v49, v3 ; 
SI-NEXT: v_mov_b32_e32 v52, v37 ; SI-NEXT: v_mov_b32_e32 v37, v29 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 @@ -36130,8 +36135,8 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v46 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v47 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v57 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v57 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill @@ -36158,8 +36163,8 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v38 ; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v37 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 -; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 +; GFX9-NEXT: v_perm_b32 v1, v62, v58, s6 +; GFX9-NEXT: v_perm_b32 v2, v35, v57, s6 ; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 ; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 ; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 @@ -36209,8 +36214,8 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr61 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr62 ; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr62 ; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 @@ -36300,8 +36305,8 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 
4-byte Folded Reload ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 -; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 +; GFX9-NEXT: v_perm_b32 v1, v62, v58, s6 +; GFX9-NEXT: v_perm_b32 v2, v35, v57, s6 ; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 ; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 ; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 @@ -37781,18 +37786,18 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; kill: killed $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -37810,13 +37815,13 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v62, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v59, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 ; SI-NEXT: 
v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 @@ -37841,8 +37846,8 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v27 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 @@ -37883,7 +37888,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v29 ; SI-NEXT: v_mov_b32_e32 v29, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v26 ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill @@ -37927,41 +37932,35 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v47 -; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 ; SI-NEXT: v_add_f64 v[54:55], v[1:2], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v18 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 -; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_add_f64 v[15:16], 
v[15:16], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v17 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v43 -; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 ; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 -; SI-NEXT: v_mov_b32_e32 v42, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v43 ; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 ; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 ; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 ; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 ; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 ; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v55 @@ -37977,9 +37976,11 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> 
%a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v13 ; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v23 ; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v24 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v25 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 @@ -38020,6 +38021,10 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 @@ -38034,11 +38039,11 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v47, v26 -; SI-NEXT: v_mov_b32_e32 v45, v27 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v55, v1 -; SI-NEXT: v_mov_b32_e32 v43, v28 +; SI-NEXT: v_mov_b32_e32 v45, v26 +; SI-NEXT: v_mov_b32_e32 v43, v27 +; SI-NEXT: v_mov_b32_e32 v42, v28 ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill @@ -38056,7 +38061,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { 
; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: .LBB52_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 @@ -38223,7 +38228,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -38232,7 +38237,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -38241,7 +38246,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -38250,7 +38255,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: 
buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -38277,21 +38282,21 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -38889,23 +38894,24 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: s_cbranch_scc0 .LBB53_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s42, s5, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s42 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v60, s42 ; SI-NEXT: s_lshr_b32 s42, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s42 -; SI-NEXT: s_lshr_b32 s42, s7, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v21, s42 +; SI-NEXT: s_lshr_b32 s42, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 ; SI-NEXT: s_lshr_b32 s42, s6, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, 
s42 ; SI-NEXT: s_lshr_b32 s42, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s42 ; SI-NEXT: s_lshr_b32 s42, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s42 ; SI-NEXT: s_lshr_b32 s42, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s42 ; SI-NEXT: s_lshr_b32 s42, s10, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v13, s42 ; SI-NEXT: s_lshr_b32 s42, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s42 ; SI-NEXT: s_lshr_b32 s42, s12, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v18, s42 ; SI-NEXT: s_lshr_b32 s42, s15, 16 @@ -38944,16 +38950,12 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v46, s42 ; SI-NEXT: s_lshr_b32 s42, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v56, s42 -; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: v_cvt_f32_f16_e32 v57, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s7 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v58, s6 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v59, s9 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v60, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v16, s11 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s10 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s13 @@ -38990,14 +38992,23 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v43 -; SI-NEXT: v_add_f64 v[22:23], s[14:15], 1.0 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_f64 v[3:4], s[4:5], 1.0 +; SI-NEXT: v_add_f64 
v[22:23], s[14:15], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v59 +; SI-NEXT: v_add_f64 v[18:19], s[12:13], 1.0 +; SI-NEXT: v_add_f64 v[7:8], s[6:7], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v60 +; SI-NEXT: v_mov_b32_e32 v60, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v5 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v41, v43 @@ -39005,13 +39016,12 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: v_add_f64 v[49:50], s[22:23], 1.0 ; SI-NEXT: v_add_f64 v[37:38], s[24:25], 1.0 ; SI-NEXT: v_add_f64 v[15:16], s[10:11], 1.0 -; SI-NEXT: v_add_f64 v[7:8], s[6:7], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v49 ; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v37 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v38 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v7 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v39 @@ -39025,7 +39035,6 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v53, v46 ; SI-NEXT: v_add_f64 v[30:31], s[28:29], 1.0 ; SI-NEXT: v_add_f64 v[26:27], s[40:41], 1.0 -; SI-NEXT: v_add_f64 v[18:19], s[12:13], 1.0 ; SI-NEXT: v_add_f64 v[11:12], s[8:9], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v50 ; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v35 @@ -39035,20 +39044,16 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: 
v_lshrrev_b32_e32 v63, 16, v26 ; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v27 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v11 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v26 @@ -39062,16 +39067,14 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; 
SI-NEXT: v_cvt_f32_f16_e32 v26, v63 @@ -39079,11 +39082,11 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v30, v62 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v56 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v42, v5 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v49, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v56 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload @@ -39095,193 +39098,193 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: .LBB53_3: ; %end ; SI-NEXT: v_cvt_f16_f32_e32 v5, v56 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v45 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v14 ; SI-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v5, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v6, v14, v6 +; SI-NEXT: v_or_b32_e32 v6, v17, v6 ; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen ; SI-NEXT: v_cvt_f16_f32_e32 v5, v44 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v6, v43 -; SI-NEXT: v_add_i32_e32 v9, vcc, 8, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 8, v0 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v5, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v5, v42 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v41 -; SI-NEXT: v_add_i32_e32 v9, 
vcc, 12, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 12, v0 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v5, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v5, v55 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v40 -; SI-NEXT: v_add_i32_e32 v9, vcc, 16, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v5, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v5, v53 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v54 -; SI-NEXT: v_add_i32_e32 v9, vcc, 20, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 20, v0 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v5, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v5, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v52 -; SI-NEXT: v_add_i32_e32 v9, vcc, 24, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 24, v0 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v5, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v5, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v50 -; SI-NEXT: v_add_i32_e32 v9, vcc, 28, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 28, v0 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v5, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v5, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v15 -; SI-NEXT: v_add_i32_e32 v9, vcc, 32, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 32, v0 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 
v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v5, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v5, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v38 -; SI-NEXT: v_add_i32_e32 v9, vcc, 36, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 36, v0 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v5, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v5, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v36 -; SI-NEXT: v_add_i32_e32 v9, vcc, 40, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 40, v0 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v5, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v5, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v3 -; SI-NEXT: v_add_i32_e32 v9, vcc, 44, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 44, v0 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v5, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v5, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v12 -; SI-NEXT: v_add_i32_e32 v9, vcc, 48, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 48, v0 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v5, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v5, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v31 -; SI-NEXT: v_add_i32_e32 v9, vcc, 52, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 52, v0 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v5, v14, s[0:3], 0 
offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v5, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v11 -; SI-NEXT: v_add_i32_e32 v9, vcc, 56, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 56, v0 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v5, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v5, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v27 -; SI-NEXT: v_add_i32_e32 v9, vcc, 60, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v5, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v5, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_i32_e32 v6, vcc, 64, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v4 +; SI-NEXT: v_add_i32_e32 v14, vcc, 64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v23 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v5, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v23 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: 
buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v5, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v19 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v5, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v19 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x50, v0 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v5, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v60 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x54, v0 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v5, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v16 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, 
s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x58, v0 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v5, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v48 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v59 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x5c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v34 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v33 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, 
v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -39345,23 +39348,23 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: s_branch .LBB53_2 ; ; VI-LABEL: bitcast_v14f64_to_v56f16_scalar: @@ -41486,15 +41489,15 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v10 ; SI-NEXT: 
v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v14 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 @@ -41587,9 +41590,9 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v49, v2 +; SI-NEXT: v_mov_b32_e32 v63, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_mov_b32_e32 v48, v3 +; SI-NEXT: v_mov_b32_e32 v49, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -41598,9 +41601,9 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: v_mov_b32_e32 v61, v44 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_mov_b32_e32 v39, v11 +; SI-NEXT: v_mov_b32_e32 v48, v11 ; SI-NEXT: v_or_b32_e32 v2, v11, v2 -; SI-NEXT: v_mov_b32_e32 v33, v10 +; SI-NEXT: v_mov_b32_e32 v35, v10 ; SI-NEXT: v_or_b32_e32 v3, v10, v3 ; SI-NEXT: v_or_b32_e32 v4, v9, v4 ; SI-NEXT: v_or_b32_e32 v5, v8, v5 @@ -41610,7 +41613,7 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v60 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v62 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v39 ; 
SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 @@ -41625,9 +41628,9 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: v_mov_b32_e32 v40, v56 ; SI-NEXT: v_or_b32_e32 v10, v56, v10 ; SI-NEXT: v_or_b32_e32 v11, v45, v11 -; SI-NEXT: v_or_b32_e32 v12, v38, v12 -; SI-NEXT: v_or_b32_e32 v13, v36, v13 -; SI-NEXT: v_or_b32_e32 v14, v35, v14 +; SI-NEXT: v_or_b32_e32 v12, v33, v12 +; SI-NEXT: v_or_b32_e32 v13, v38, v13 +; SI-NEXT: v_or_b32_e32 v14, v36, v14 ; SI-NEXT: v_or_b32_e32 v15, v32, v15 ; SI-NEXT: v_or_b32_e32 v17, v37, v17 ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload @@ -41689,10 +41692,10 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -41702,7 +41705,7 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v49 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 @@ -41720,9 +41723,9 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: 
v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v32 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 @@ -41835,7 +41838,7 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v39 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 @@ -41975,10 +41978,10 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB55_4: -; SI-NEXT: v_mov_b32_e32 v39, v11 -; SI-NEXT: v_mov_b32_e32 v33, v10 -; SI-NEXT: v_mov_b32_e32 v49, v2 -; SI-NEXT: v_mov_b32_e32 v48, v3 +; SI-NEXT: v_mov_b32_e32 v48, v11 +; SI-NEXT: v_mov_b32_e32 v35, v10 +; SI-NEXT: v_mov_b32_e32 v63, v2 +; SI-NEXT: v_mov_b32_e32 v49, v3 ; SI-NEXT: v_mov_b32_e32 v52, v37 ; SI-NEXT: v_mov_b32_e32 v37, v29 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 @@ -44178,9 +44181,8 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: 
buffer_load_dword v57, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 ; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt expcnt(4) @@ -44226,7 +44228,7 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB57_2 @@ -44252,33 +44254,35 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v33, s21 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v37 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v35 -; SI-NEXT: v_mov_b32_e32 v47, v34 +; SI-NEXT: v_mov_b32_e32 v57, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v36 +; SI-NEXT: v_mov_b32_e32 v56, v35 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v35 +; SI-NEXT: v_mov_b32_e32 v47, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v53, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt 
expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v61 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v61 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v58 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v58 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, s27 @@ -44362,6 +44366,12 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: .LBB57_2: ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: v_mov_b32_e32 v57, v36 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: v_mov_b32_e32 v56, v35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 ; SI-NEXT: v_mov_b32_e32 v47, v34 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; kill: killed $vgpr33 @@ -44449,10 +44459,6 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: ; kill: killed $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 ; SI-NEXT: .LBB57_3: ; %Flow ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) @@ -44470,6 +44476,8 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: ; %bb.4: ; %cmp.true ; 
SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v56 +; SI-NEXT: v_add_i32_e32 v56, vcc, 3, v57 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_cvt_f32_f16_e32 v33, s16 ; SI-NEXT: s_add_i32 s18, s18, 3 @@ -44512,40 +44520,38 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, s25 ; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; SI-NEXT: v_add_i32_e32 v56, vcc, 3, v56 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, s26 -; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v57 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, s27 ; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: v_cvt_f32_f16_e32 v38, s17 ; SI-NEXT: v_cvt_f32_f16_e32 v39, s19 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s27 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v53, v46 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s28 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v63 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 
; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v49, v62 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v62 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v58 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v61 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v48, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v57 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll index 4f46875076809..c3072b01f7db2 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll @@ -5812,8 +5812,8 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v44 ; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v46 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v47 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v47 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v57 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v59 @@ -5848,8 +5848,8 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 ; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 ; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 -; GFX9-NEXT: 
v_perm_b32 v5, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 +; GFX9-NEXT: v_perm_b32 v5, v63, v56, s6 +; GFX9-NEXT: v_perm_b32 v6, v34, v47, s6 ; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 ; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 @@ -5891,8 +5891,8 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr35 ; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: ; implicit-def: $vgpr62 @@ -6006,8 +6006,8 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 ; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 ; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 -; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 +; GFX9-NEXT: v_perm_b32 v5, v63, v56, s6 +; GFX9-NEXT: v_perm_b32 v6, v34, v47, s6 ; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 ; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 ; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] @@ -7480,39 +7480,43 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; 
SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill 
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill @@ -7535,10 +7539,12 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr38 @@ -7548,20 +7554,18 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: ; implicit-def: $vgpr31 @@ -7581,18 +7585,14 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; 
implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 @@ -7602,52 +7602,57 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v61, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v32 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v32 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v32 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v32 ; SI-NEXT: v_lshrrev_b32_e32 
v32, 16, v8 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v32 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v5 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v32 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v35 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 @@ -7655,9 +7660,6 @@ define <60 x half> 
@bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v62, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 @@ -7673,43 +7675,42 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v35 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v35 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v35 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v25 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; 
SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v26 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v52, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 @@ -7747,37 +7748,61 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v37, 
16, v23 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v22 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 ; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 ; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 ; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v63 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 +; SI-NEXT: 
v_lshrrev_b32_e32 v61, 16, v18 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 +; SI-NEXT: v_mov_b32_e32 v61, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v47 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 @@ -7788,20 +7813,15 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 ; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 ; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 ; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 ; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 ; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 
v50, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 ; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 @@ -7811,15 +7831,12 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v15 ; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v24 ; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v25 ; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v26 ; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v27 ; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 @@ -7839,27 +7856,24 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v47, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 -; SI-NEXT: v_mov_b32_e32 v61, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 @@ -7869,46 +7883,41 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_mov_b32_e32 v37, v27 -; SI-NEXT: v_mov_b32_e32 v35, v28 -; SI-NEXT: v_mov_b32_e32 v34, v29 -; SI-NEXT: v_mov_b32_e32 v32, v30 -; SI-NEXT: v_mov_b32_e32 v63, v25 -; SI-NEXT: v_mov_b32_e32 v59, v26 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, 
off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v34, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v57 +; SI-NEXT: v_mov_b32_e32 v39, v28 +; SI-NEXT: v_mov_b32_e32 v37, v29 +; SI-NEXT: v_mov_b32_e32 v35, v30 +; SI-NEXT: v_mov_b32_e32 v63, v26 +; SI-NEXT: v_mov_b32_e32 v59, v27 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, 
off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v57, v1 ; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -7927,43 +7936,39 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; 
SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 @@ -7972,7 +7977,7 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: 
buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 @@ -7981,7 +7986,7 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 ; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 @@ -7990,7 +7995,7 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 ; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 @@ -7999,7 +8004,7 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 ; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 @@ -8008,7 +8013,7 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: 
buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 @@ -8017,7 +8022,7 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 @@ -8026,7 +8031,7 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 @@ -8035,7 +8040,7 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 @@ -8045,8 +8050,8 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; 
SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -8056,8 +8061,8 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -8067,8 +8072,8 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -8078,8 +8083,8 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt 
expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -8089,8 +8094,8 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -8100,8 +8105,8 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -8110,62 +8115,66 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; 
SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 -; 
SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -11757,9 +11766,9 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v12, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s20 ; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 ; SI-NEXT: v_cvt_f16_f32_e32 v9, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 @@ -11838,9 +11847,9 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, 
s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v10, v2 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 ; SI-NEXT: v_mov_b32_e32 v33, v32 @@ -11868,13 +11877,13 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 ; SI-NEXT: v_or_b32_e32 v1, v12, v1 -; SI-NEXT: v_or_b32_e32 v2, v11, v2 +; SI-NEXT: v_or_b32_e32 v3, v11, v3 ; SI-NEXT: v_or_b32_e32 v4, v9, v4 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v36 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 @@ -12039,7 +12048,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload @@ -12061,7 +12070,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i 
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: s_waitcnt vmcnt(11) @@ -17822,8 +17831,8 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v44 ; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v46 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v47 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v47 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v57 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v59 @@ -17858,8 +17867,8 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 ; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 ; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 -; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 +; GFX9-NEXT: v_perm_b32 v5, v63, v56, s6 +; GFX9-NEXT: v_perm_b32 v6, v34, v47, s6 ; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 ; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 @@ -17901,8 +17910,8 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr35 ; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: ; implicit-def: $vgpr62 @@ -18016,8 +18025,8 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { 
; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 ; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 ; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 -; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 +; GFX9-NEXT: v_perm_b32 v5, v63, v56, s6 +; GFX9-NEXT: v_perm_b32 v6, v34, v47, s6 ; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 ; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 ; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] @@ -19490,39 +19499,43 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; 
SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill @@ -19545,10 +19558,12 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr38 @@ -19558,20 
+19573,18 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: ; implicit-def: $vgpr31 @@ -19591,18 +19604,14 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 @@ -19612,52 +19621,57 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 
%b) { ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v61, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v32 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v32 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v32 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v32 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v32 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v5 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v34, 
v32 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v35 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 @@ -19665,9 +19679,6 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v62, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 @@ -19683,43 +19694,42 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 
v45, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v35 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v35 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v35 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v25 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: 
buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v26 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v52, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 @@ -19757,37 +19767,61 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v22 ; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 ; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 ; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 +; SI-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 ; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v19 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 -; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v63 -; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v18 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 +; SI-NEXT: v_mov_b32_e32 v61, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 ; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 
v47, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v47 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 ; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 @@ -19798,20 +19832,15 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 ; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 ; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 ; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 ; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 ; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 ; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 ; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 ; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 @@ -19821,15 +19850,12 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v15 ; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v24 ; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v25 ; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v26 ; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v27 ; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v29 
+; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 @@ -19855,70 +19881,62 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 -; SI-NEXT: v_mov_b32_e32 v61, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v47, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_mov_b32_e32 v37, v27 -; SI-NEXT: v_mov_b32_e32 v35, v28 -; SI-NEXT: v_mov_b32_e32 v34, v29 -; SI-NEXT: v_mov_b32_e32 v32, v30 -; 
SI-NEXT: v_mov_b32_e32 v63, v25 -; SI-NEXT: v_mov_b32_e32 v59, v26 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v34, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v57 +; SI-NEXT: v_mov_b32_e32 v39, v28 +; SI-NEXT: v_mov_b32_e32 v37, v29 +; SI-NEXT: v_mov_b32_e32 v35, v30 +; SI-NEXT: v_mov_b32_e32 v63, v26 +; SI-NEXT: v_mov_b32_e32 v59, v27 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], 
s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v57, v1 ; SI-NEXT: .LBB32_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -19937,43 +19955,39 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: 
v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 @@ -19982,7 +19996,7 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 @@ -19991,7 +20005,7 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 ; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 @@ -20000,7 +20014,7 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, 
v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 ; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 @@ -20009,7 +20023,7 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 ; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 @@ -20018,7 +20032,7 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 @@ -20027,7 +20041,7 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 @@ -20036,7 +20050,7 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, 
v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 @@ -20045,7 +20059,7 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 @@ -20055,8 +20069,8 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -20066,8 +20080,8 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -20077,8 +20091,8 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -20088,8 +20102,8 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -20099,8 +20113,8 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -20110,8 +20124,8 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -20120,62 +20134,66 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: 
v_cvt_f16_f32_e32 v1, v61 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; 
SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -23744,9 +23762,9 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v12, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s20 ; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 ; SI-NEXT: v_cvt_f16_f32_e32 v9, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 @@ -23825,9 +23843,9 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v10, v2 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 ; SI-NEXT: v_mov_b32_e32 v33, v32 @@ -23855,13 +23873,13 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 ; SI-NEXT: v_or_b32_e32 v1, v12, 
v1 -; SI-NEXT: v_or_b32_e32 v2, v11, v2 +; SI-NEXT: v_or_b32_e32 v3, v11, v3 ; SI-NEXT: v_or_b32_e32 v4, v9, v4 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v36 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 @@ -24026,7 +24044,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload @@ -24048,7 +24066,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: s_waitcnt vmcnt(11) @@ -28934,8 +28952,8 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v44 ; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v46 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v47 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v47 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v57 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, 
v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v59 @@ -28970,8 +28988,8 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 ; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 ; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 -; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 +; GFX9-NEXT: v_perm_b32 v5, v63, v56, s6 +; GFX9-NEXT: v_perm_b32 v6, v34, v47, s6 ; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 ; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 @@ -29013,8 +29031,8 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr35 ; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: ; implicit-def: $vgpr62 @@ -29128,8 +29146,8 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 ; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 ; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 -; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 +; GFX9-NEXT: v_perm_b32 v5, v63, v56, s6 +; GFX9-NEXT: v_perm_b32 v6, v34, v47, s6 ; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 ; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 ; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] @@ -30602,39 +30620,43 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; 
SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; 
kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill @@ -30657,9 +30679,11 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr48 @@ -30670,20 +30694,18 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; 
SI-NEXT: ; implicit-def: $vgpr31 @@ -30703,18 +30725,14 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 @@ -30724,52 +30742,57 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v61, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v32 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v32 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v32 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
v_lshrrev_b32_e32 v31, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v32 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v32 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: 
v_lshrrev_b32_e32 v31, 16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 @@ -30777,11 +30800,6 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v62, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 @@ -30795,43 +30813,44 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v35 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v35 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v35 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v25 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte 
Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v26 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], 
s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v40, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 @@ -30872,9 +30891,9 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: 
s_waitcnt expcnt(0) ; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc @@ -30892,38 +30911,57 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 ; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v22 ; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 ; SI-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v19 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 -; SI-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1 -; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v24 ; SI-NEXT: 
v_cvt_f32_f16_e32 v56, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v18 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v63 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v43 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 +; SI-NEXT: v_mov_b32_e32 v61, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v47 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 ; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v5 ; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 @@ -30934,15 +30972,12 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v15 
; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v24 ; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v25 ; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v26 ; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v27 ; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 @@ -30968,21 +31003,15 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 -; SI-NEXT: v_mov_b32_e32 v61, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 @@ -30993,45 +31022,43 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; 
SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_mov_b32_e32 v37, v27 -; SI-NEXT: v_mov_b32_e32 v35, v28 -; SI-NEXT: v_mov_b32_e32 v34, v29 -; SI-NEXT: v_mov_b32_e32 v32, v30 -; SI-NEXT: v_mov_b32_e32 v63, v25 -; SI-NEXT: v_mov_b32_e32 v59, v26 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], 
s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v34, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v57 +; SI-NEXT: v_mov_b32_e32 v39, v28 +; SI-NEXT: v_mov_b32_e32 v37, v29 +; SI-NEXT: v_mov_b32_e32 v35, v30 +; SI-NEXT: v_mov_b32_e32 v63, v26 +; SI-NEXT: v_mov_b32_e32 v59, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v47, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v57, v1 ; SI-NEXT: .LBB44_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -31050,34 +31077,30 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 @@ -31086,7 +31109,7 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 @@ -31095,7 +31118,7 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 @@ -31104,7 +31127,7 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; 
SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 ; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 @@ -31113,7 +31136,7 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 ; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 @@ -31122,7 +31145,7 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 ; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 @@ -31131,7 +31154,7 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 @@ -31140,7 +31163,7 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; 
SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 @@ -31149,7 +31172,7 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 @@ -31158,7 +31181,7 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 @@ -31168,8 +31191,8 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -31179,8 +31202,8 @@ define <60 x half> 
@bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -31190,8 +31213,8 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -31201,8 +31224,8 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -31212,8 +31235,8 @@ define <60 x half> 
@bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -31223,8 +31246,8 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -31233,62 +31256,66 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt 
vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: 
v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -34896,9 +34923,9 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v12, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s20 ; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 ; SI-NEXT: v_cvt_f16_f32_e32 v9, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 @@ -34977,9 +35004,9 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v10, v2 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 ; SI-NEXT: v_mov_b32_e32 v33, v32 @@ -35007,13 +35034,13 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; 
SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 ; SI-NEXT: v_or_b32_e32 v1, v12, v1 -; SI-NEXT: v_or_b32_e32 v2, v11, v2 +; SI-NEXT: v_or_b32_e32 v3, v11, v3 ; SI-NEXT: v_or_b32_e32 v4, v9, v4 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v36 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 @@ -35178,7 +35205,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload @@ -35200,7 +35227,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: s_waitcnt vmcnt(11) @@ -39052,8 +39079,8 @@ define <15 x double> 
@bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v44 ; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v46 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v47 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v47 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v57 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v59 @@ -39088,8 +39115,8 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 ; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 ; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 -; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 +; GFX9-NEXT: v_perm_b32 v5, v63, v56, s6 +; GFX9-NEXT: v_perm_b32 v6, v34, v47, s6 ; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 ; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 @@ -39131,8 +39158,8 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr35 ; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: ; implicit-def: $vgpr62 @@ -39246,8 +39273,8 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 ; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 ; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 -; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 +; GFX9-NEXT: v_perm_b32 v5, v63, v56, s6 +; GFX9-NEXT: v_perm_b32 v6, v34, v47, s6 ; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 ; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 ; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] @@ -42027,29 +42054,27 @@ define inreg <60 x 
half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI-NEXT: s_cbranch_scc0 .LBB53_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s44, s5, 16 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v59, s44 -; SI-NEXT: s_lshr_b32 s44, s4, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v36, s44 +; SI-NEXT: s_lshr_b32 s44, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s44 ; SI-NEXT: s_lshr_b32 s44, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s44 ; SI-NEXT: s_lshr_b32 s44, s6, 16 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v61, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s44 ; SI-NEXT: s_lshr_b32 s44, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s44 ; SI-NEXT: s_lshr_b32 s44, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s44 ; SI-NEXT: s_lshr_b32 s44, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s44 ; SI-NEXT: s_lshr_b32 s44, s10, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s44 ; SI-NEXT: s_lshr_b32 s44, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s44 ; SI-NEXT: s_lshr_b32 s44, s12, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v17, s44 ; SI-NEXT: s_lshr_b32 s44, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s44 ; SI-NEXT: s_lshr_b32 s44, s14, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v22, s44 ; SI-NEXT: s_lshr_b32 s44, s41, 16 @@ -42085,21 +42110,24 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI-NEXT: s_lshr_b32 s44, s18, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v56, s44 ; SI-NEXT: s_lshr_b32 s44, s17, 16 +; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: v_cvt_f32_f16_e32 v58, s44 ; SI-NEXT: s_lshr_b32 s44, s16, 16 +; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v60, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 ; 
SI-NEXT: v_cvt_f32_f16_e32 v19, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s10 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v61, s6 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v62, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v62, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s10 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v63, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v63, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s12 ; SI-NEXT: v_cvt_f32_f16_e32 v23, s15 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s14 ; SI-NEXT: v_cvt_f32_f16_e32 v27, s41 @@ -42117,9 +42145,9 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v42, s21 ; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 ; SI-NEXT: v_cvt_f32_f16_e32 v20, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s18 ; SI-NEXT: v_cvt_f32_f16_e32 v57, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s16 ; SI-NEXT: s_cbranch_execnz .LBB53_3 ; SI-NEXT: .LBB53_2: ; %cmp.true ; SI-NEXT: v_add_f64 v[1:2], s[16:17], 1.0 @@ -42140,104 +42168,107 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v42 -; SI-NEXT: v_add_f64 v[15:16], s[10:11], 1.0 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v53 ; SI-NEXT: v_add_f64 v[20:21], s[12:13], 1.0 +; SI-NEXT: v_add_f64 v[11:12], s[8:9], 1.0 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v15 +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v54 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v11 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v57 ; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_add_f64 v[37:38], s[26:27], 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v15 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_add_f64 v[7:8], s[6:7], 1.0 ; SI-NEXT: v_add_f64 v[34:35], s[28:29], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v53 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f64 v[3:4], s[4:5], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v55 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v41 +; SI-NEXT: v_add_f64 v[37:38], s[26:27], 1.0 +; SI-NEXT: v_add_f64 v[15:16], s[10:11], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, 
v38 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v44 ; SI-NEXT: v_add_f64 v[30:31], s[42:43], 1.0 -; SI-NEXT: v_add_f64 v[11:12], s[8:9], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v35 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v12 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v46 ; SI-NEXT: v_add_f64 v[49:50], s[24:25], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v50 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v16 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v46 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f64 v[3:4], s[4:5], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 ; SI-NEXT: v_add_f64 v[26:27], s[40:41], 1.0 ; SI-NEXT: v_add_f64 v[22:23], s[14:15], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 
v14, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v5 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v26 ; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v22 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v7 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v59 -; SI-NEXT: v_mov_b32_e32 v59, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v18 +; SI-NEXT: v_mov_b32_e32 v18, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v13 -; SI-NEXT: v_mov_b32_e32 v13, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v28 +; 
SI-NEXT: v_cvt_f32_f16_e32 v28, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v53, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v55, v6 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) @@ -42256,9 +42287,9 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v60, v6 ; SI-NEXT: .LBB53_3: ; %end ; SI-NEXT: v_cvt_f16_f32_e32 v6, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v59 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v9, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v58 @@ -42270,7 +42301,7 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI-NEXT: buffer_store_dword v9, v6, s[0:3], 0 offen ; SI-NEXT: v_cvt_f16_f32_e32 v6, v56 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v47 ; SI-NEXT: v_add_i32_e32 v10, vcc, 8, v0 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v9, v6 @@ -42340,125 +42371,125 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v6, v34 -; SI-NEXT: v_add_i32_e32 v9, vcc, 48, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v3 +; SI-NEXT: v_add_i32_e32 v10, vcc, 48, v0 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v3, v3, v6 -; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v6, v9, v6 +; SI-NEXT: buffer_store_dword 
v6, v10, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v35 -; SI-NEXT: v_add_i32_e32 v9, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 -; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v6, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v35 +; SI-NEXT: v_add_i32_e32 v10, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v9, v6 +; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v12 -; SI-NEXT: v_add_i32_e32 v9, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 -; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v6, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v12 +; SI-NEXT: v_add_i32_e32 v10, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v9, v6 +; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v31 -; SI-NEXT: v_add_i32_e32 v9, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 -; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v6, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v31 +; SI-NEXT: v_add_i32_e32 v10, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v9, v6 +; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v11 -; SI-NEXT: v_add_i32_e32 v9, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 -; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v6, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v11 +; SI-NEXT: 
v_add_i32_e32 v10, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v9, v6 +; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v27 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 -; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v6, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v27 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v9, v6 +; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v22 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x48, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v6, v22 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x48, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 +; SI-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v23 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v4, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v23 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v63 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x50, v0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, 
v4, v3 -; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v21 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v62 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v4, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: buffer_store_dword v4, v8, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v43 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x58, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x58, v0 +; SI-NEXT: v_or_b32_e32 v1, v4, v1 +; SI-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v45 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x5c, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v40 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x5c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v1, v4, v1 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: 
v_cvt_f16_f32_e32 v2, v52 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -42482,11 +42513,11 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: 
s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr46 @@ -42521,27 +42552,27 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: s_branch .LBB53_2 ; ; VI-LABEL: bitcast_v15f64_to_v60f16_scalar: @@ -44881,9 +44912,9 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a 
; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v12, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s20 ; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 ; SI-NEXT: v_cvt_f16_f32_e32 v9, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 @@ -44962,9 +44993,9 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v10, v2 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 ; SI-NEXT: v_mov_b32_e32 v33, v32 @@ -44992,13 +45023,13 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 ; SI-NEXT: v_or_b32_e32 v1, v12, v1 -; SI-NEXT: v_or_b32_e32 v2, v11, v2 +; SI-NEXT: v_or_b32_e32 v3, v11, v3 ; SI-NEXT: v_or_b32_e32 v4, v9, v4 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v36 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 @@ -45163,7 +45194,7 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: 
buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload @@ -45185,7 +45216,7 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: s_waitcnt vmcnt(11) @@ -47867,7 +47898,7 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v49 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB57_2 @@ -47889,118 +47920,123 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s18 -; SI-NEXT: v_cvt_f32_f16_e32 
v57, v39 +; SI-NEXT: v_mov_b32_e32 v48, v39 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v39 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v37 +; SI-NEXT: v_mov_b32_e32 v39, v38 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v38 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v7 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v35 +; SI-NEXT: v_mov_b32_e32 v38, v37 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v8 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s23 -; SI-NEXT: v_mov_b32_e32 v35, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v37 ; SI-NEXT: buffer_store_dword v31, off, 
s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v9 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v34 +; SI-NEXT: v_mov_b32_e32 v37, v36 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v10 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s25 -; SI-NEXT: v_mov_b32_e32 v34, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v36 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v11 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v33 +; SI-NEXT: v_mov_b32_e32 v36, v35 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v12 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s27 -; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v35 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v32 +; SI-NEXT: v_mov_b32_e32 v35, v34 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: 
v_cvt_f32_f16_e32 v31, v14 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s29 -; SI-NEXT: v_mov_b32_e32 v32, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v34 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v15 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v50 +; SI-NEXT: v_mov_b32_e32 v34, v33 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v33 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v17 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v62 +; SI-NEXT: v_mov_b32_e32 v33, v32 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v60 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v54, v32 +; SI-NEXT: v_mov_b32_e32 v32, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v50 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v19 -; 
SI-NEXT: v_cvt_f32_f16_e32 v51, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v61 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v60 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v51, v58 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v21 @@ -48029,6 +48065,31 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; kill: killed $vgpr49 ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v48, v39 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: v_mov_b32_e32 v39, v38 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: v_mov_b32_e32 v38, v37 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: v_mov_b32_e32 v37, v36 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: v_mov_b32_e32 v36, v35 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 ; SI-NEXT: v_mov_b32_e32 v35, v34 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 @@ -48096,26 +48157,6 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-NEXT: ; kill: killed $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr31 ; 
SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr31 @@ -48151,6 +48192,11 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v33 ; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v34 ; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v48 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_cvt_f32_f16_e32 v49, s16 ; SI-NEXT: s_add_i32 s17, s17, 3 @@ -48193,50 +48239,45 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s23 ; SI-NEXT: v_add_i32_e32 v63, vcc, 3, v63 -; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v39 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s24 -; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 -; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v38 +; SI-NEXT: 
v_cvt_f32_f16_e32 v47, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v36 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s25 -; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 -; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v33 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v63 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v60 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v58 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s29 -; SI-NEXT: 
v_cvt_f32_f16_e32 v45, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v62 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v58 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) @@ -49746,48 +49787,48 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:88 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v4 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], 
s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v40, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v7 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v6, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v51 @@ -49797,7 +49838,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v44, v62 ; 
SI-NEXT: v_cvt_f16_f32_e32 v18, v63 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v43, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v50 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v54 @@ -49811,28 +49852,26 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_cvt_f16_f32_e32 v30, v56 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v62, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v31 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v36, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v32 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v45, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v58 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v34 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v32, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v35 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v35 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v33, v37 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 
SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f16_f32_e32 v42, v3 ; SI-NEXT: s_waitcnt vmcnt(5) @@ -49840,21 +49879,19 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v39, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v35 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v41, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v37 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v45 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v46 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: v_mov_b32_e32 v47, v21 -; SI-NEXT: v_mov_b32_e32 v56, v17 -; SI-NEXT: v_mov_b32_e32 v57, v6 +; SI-NEXT: v_mov_b32_e32 v56, v6 ; SI-NEXT: v_mov_b32_e32 v58, v7 -; SI-NEXT: v_mov_b32_e32 v59, v33 ; SI-NEXT: s_xor_b64 exec, exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.true @@ -49862,34 +49899,34 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v57 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 ; SI-NEXT: 
v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_or_b32_e32 v3, v3, v34 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v10 -; SI-NEXT: v_or_b32_e32 v9, v9, v34 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v32 -; SI-NEXT: v_or_b32_e32 v31, v31, v34 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v62 -; SI-NEXT: v_or_b32_e32 v63, v6, v34 +; SI-NEXT: v_or_b32_e32 v3, v3, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v10 +; SI-NEXT: v_or_b32_e32 v9, v9, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v34 +; SI-NEXT: v_or_b32_e32 v31, v31, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v63 +; SI-NEXT: v_or_b32_e32 v57, v6, v35 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 @@ -49905,7 +49942,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 @@ -49916,114 +49953,119 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: 
v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_or_b32_e32 v12, v12, v34 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v16 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_or_b32_e32 v12, v12, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_or_b32_e32 v15, v15, v34 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v19 +; SI-NEXT: v_or_b32_e32 v15, v15, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_or_b32_e32 v18, v18, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v22, v22, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v27 +; SI-NEXT: v_or_b32_e32 v26, v26, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v8 +; SI-NEXT: v_or_b32_e32 v11, v11, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v20 +; SI-NEXT: v_or_b32_e32 v1, v1, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v59 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v35 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_or_b32_e32 v18, v18, v34 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v22, v22, v34 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v27 -; SI-NEXT: v_or_b32_e32 v26, v26, v34 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v5 
-; SI-NEXT: v_or_b32_e32 v11, v11, v34 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v20 -; SI-NEXT: v_or_b32_e32 v2, v2, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v34 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v45 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v37, v34, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v48 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v58 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v34 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_or_b32_e32 v48, v34, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v2, v2, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v35 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v48, v35, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, 
v34 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v34 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v35 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v53 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_or_b32_e32 v52, v34, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v55 +; SI-NEXT: v_or_b32_e32 v52, v35, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v55 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v34 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v35 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_or_b32_e32 v55, v34, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v60 +; SI-NEXT: v_or_b32_e32 v55, v35, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v60 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v6 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 ; SI-NEXT: 
v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_or_b32_e32 v6, v35, v34 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_or_b32_e32 v6, v37, v35 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v43 ; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 ; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 @@ -50034,7 +50076,10 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 ; SI-NEXT: 
v_cvt_f16_f32_e32 v41, v41 ; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 @@ -50042,158 +50087,145 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 ; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 ; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 ; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 ; SI-NEXT: v_or_b32_e32 v25, v25, v24 ; SI-NEXT: v_or_b32_e32 v29, v29, v28 ; SI-NEXT: v_or_b32_e32 v54, v54, v51 ; SI-NEXT: v_or_b32_e32 v50, v50, v30 +; SI-NEXT: v_or_b32_e32 v36, v36, v32 +; SI-NEXT: v_or_b32_e32 v33, v33, v42 ; SI-NEXT: v_or_b32_e32 v39, v39, v41 -; SI-NEXT: v_alignbit_b32 v60, v55, v34, 16 +; SI-NEXT: v_alignbit_b32 v60, v55, v35, 16 +; SI-NEXT: v_alignbit_b32 v59, v52, v45, 16 ; SI-NEXT: v_alignbit_b32 v24, v26, v24, 16 ; SI-NEXT: v_alignbit_b32 v28, v22, v28, 16 ; SI-NEXT: v_alignbit_b32 v51, v12, v51, 16 -; SI-NEXT: v_alignbit_b32 v30, v63, v30, 16 +; SI-NEXT: v_alignbit_b32 v30, v57, v30, 16 +; SI-NEXT: v_alignbit_b32 v32, v31, v32, 16 +; SI-NEXT: v_alignbit_b32 v42, v9, v42, 16 ; SI-NEXT: v_alignbit_b32 v41, v3, v41, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_or_b32_e32 v6, v35, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 
v46, v6 +; SI-NEXT: v_or_b32_e32 v6, v37, v45 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v58 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_alignbit_b32 v1, v52, v1, 16 +; SI-NEXT: v_or_b32_e32 v6, v37, v5 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v46 -; SI-NEXT: v_or_b32_e32 v58, v35, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v56 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 -; SI-NEXT: v_alignbit_b32 v8, v48, v8, 16 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: v_or_b32_e32 v57, v46, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_alignbit_b32 v5, v48, v5, 16 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_or_b32_e32 v58, v46, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v46, v47 -; SI-NEXT: v_alignbit_b32 v14, v37, v14, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v56, v35, v17 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v59 +; SI-NEXT: v_alignbit_b32 v14, v2, v14, 16 +; SI-NEXT: v_or_b32_e32 v56, v37, v17 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v46 +; SI-NEXT: 
v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v62 +; SI-NEXT: v_alignbit_b32 v17, v1, v17, 16 ; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v46 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 -; SI-NEXT: v_or_b32_e32 v59, v46, v43 +; SI-NEXT: v_or_b32_e32 v62, v46, v43 ; SI-NEXT: v_alignbit_b32 v43, v15, v43, 16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v47, v35, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v44 +; SI-NEXT: v_or_b32_e32 v47, v37, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v61 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; SI-NEXT: v_or_b32_e32 v61, v44, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v45 -; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v44 -; SI-NEXT: v_or_b32_e32 v36, v36, v45 -; SI-NEXT: v_alignbit_b32 v44, v18, v35, 16 -; SI-NEXT: v_alignbit_b32 v45, v31, v45, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v6 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_or_b32_e32 v6, v33, v42 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v2, v17, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v6, v11, v21, 16 -; SI-NEXT: v_alignbit_b32 v42, v9, v42, 16 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: 
v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_or_b32_e32 v61, v44, v37 +; SI-NEXT: v_alignbit_b32 v44, v18, v37, 16 ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v34, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v34, v34, v35 -; SI-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v35, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v35, v35, v37 +; SI-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v34, 0xffff, v55 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v40 -; SI-NEXT: v_or_b32_e32 v34, v34, v35 -; SI-NEXT: v_add_i32_e32 v35, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v34, v35, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v35, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v40 +; SI-NEXT: v_or_b32_e32 v35, v35, v37 +; SI-NEXT: v_add_i32_e32 v37, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v59 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; SI-NEXT: v_and_b32_e32 v34, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v1, v34, v1 -; SI-NEXT: v_add_i32_e32 v34, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v34, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, 
v52 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v53 -; SI-NEXT: v_or_b32_e32 v1, v1, v34 -; SI-NEXT: v_add_i32_e32 v34, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v34, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v58 -; SI-NEXT: v_or_b32_e32 v1, v1, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v48 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v49 -; SI-NEXT: v_or_b32_e32 v1, v1, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v57 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v14 -; SI-NEXT: v_or_b32_e32 v1, v1, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v35, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v35, v35, v37 +; SI-NEXT: v_add_i32_e32 v37, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v35, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v53 +; SI-NEXT: v_or_b32_e32 v35, v35, v37 +; SI-NEXT: v_add_i32_e32 v37, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_and_b32_e32 v35, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v5, v35, v5 +; SI-NEXT: v_add_i32_e32 v35, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v5, v35, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v37 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v5, 0xffff, 
v48 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v49 +; SI-NEXT: v_or_b32_e32 v5, v5, v35 +; SI-NEXT: v_add_i32_e32 v35, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v5, v35, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v58 +; SI-NEXT: v_or_b32_e32 v5, v5, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v5, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 +; SI-NEXT: v_or_b32_e32 v2, v2, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v56 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v6 -; SI-NEXT: v_or_b32_e32 v1, v1, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v17 +; SI-NEXT: v_or_b32_e32 v2, v2, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 @@ -50208,7 +50240,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -50249,7 +50281,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v59 +; SI-NEXT: 
v_and_b32_e32 v1, 0xffff, v62 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 @@ -50279,28 +50311,26 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v63 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v62 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v63 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v36 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v33 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -50995,9 +51025,9 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v48, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v6 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v42, v7 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
v_cvt_f16_f32_e32 v1, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 @@ -51082,8 +51112,7 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v33, v11 ; SI-NEXT: v_mov_b32_e32 v11, v8 ; SI-NEXT: v_mov_b32_e32 v8, v5 -; SI-NEXT: v_mov_b32_e32 v5, v42 -; SI-NEXT: v_mov_b32_e32 v42, v1 +; SI-NEXT: v_mov_b32_e32 v5, v1 ; SI-NEXT: s_cbranch_vccnz .LBB59_5 ; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll index e5245f7bd71d3..d36f879b378b2 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll @@ -7772,11 +7772,11 @@ define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v13 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[16:17], 24, v[10:11] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[14:15], 24, v[12:13] -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v13.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v13.h +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[14:15], 24, v[12:13] ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v12.l ; GFX11-TRUE16-NEXT: .LBB38_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB38_4 @@ -7847,6 +7847,7 @@ define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v10 ; GFX11-TRUE16-NEXT: .LBB38_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v12.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v14.l @@ -10731,62 +10732,57 @@ define <12 x i8> @bitcast_v6i16_to_v12i8(<6 x i16> %a, i32 %b) { ; VI-LABEL: bitcast_v6i16_to_v12i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v8, v2 +; VI-NEXT: v_mov_b32_e32 v14, v1 +; VI-NEXT: v_mov_b32_e32 v13, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v0 -; VI-NEXT: ; implicit-def: $vgpr16 -; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v13 +; VI-NEXT: ; implicit-def: $vgpr1 ; VI-NEXT: ; implicit-def: $vgpr3 -; VI-NEXT: ; implicit-def: $vgpr14 ; VI-NEXT: ; implicit-def: $vgpr5 ; VI-NEXT: ; implicit-def: $vgpr7 -; VI-NEXT: ; implicit-def: $vgpr8 ; VI-NEXT: ; implicit-def: $vgpr9 ; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB46_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; VI-NEXT: v_lshrrev_b32_e32 v15, 8, v0 -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[2:3] -; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; VI-NEXT: v_mov_b32_e32 v16, v0 -; VI-NEXT: v_mov_b32_e32 v14, v1 -; VI-NEXT: v_mov_b32_e32 v8, v2 -; VI-NEXT: ; implicit-def: $vgpr1 -; VI-NEXT: ; implicit-def: $vgpr2 -; VI-NEXT: .LBB46_2: ; %Flow +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v14 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v14 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14] +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v13 +; VI-NEXT: ; %bb.2: ; %Flow ; VI-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB46_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v3, 3 -; VI-NEXT: v_add_u16_sdwa v6, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v13, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v14, 3, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; VI-NEXT: v_add_u16_e32 v16, 3, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v13 -; VI-NEXT: v_or_b32_e32 v1, v14, v1 -; VI-NEXT: v_or_b32_e32 v0, v16, v0 -; VI-NEXT: v_add_u16_sdwa v10, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v8, 3, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; VI-NEXT: v_add_u16_sdwa v6, v14, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v15, 3, v14 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; VI-NEXT: v_add_u16_sdwa v2, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v10, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v15, v0 +; VI-NEXT: v_add_u16_e32 v14, 3, v13 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; VI-NEXT: v_add_u16_e32 v16, 3, v8 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 +; VI-NEXT: v_or_b32_e32 v0, v14, v0 +; VI-NEXT: v_or_b32_e32 v7, v16, v3 ; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; VI-NEXT: v_or_b32_e32 v2, v8, v2 -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[2:3] -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[7:8] +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v7 ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; VI-NEXT: v_lshrrev_b32_e32 v15, 8, v0 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; VI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; VI-NEXT: v_mov_b32_e32 v13, v14 +; VI-NEXT: v_mov_b32_e32 v14, v15 +; VI-NEXT: v_mov_b32_e32 v8, v16 ; VI-NEXT: .LBB46_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, 
s[4:5] -; VI-NEXT: v_mov_b32_e32 v0, v16 -; VI-NEXT: v_mov_b32_e32 v1, v15 -; VI-NEXT: v_mov_b32_e32 v2, v13 +; VI-NEXT: v_mov_b32_e32 v0, v13 ; VI-NEXT: v_mov_b32_e32 v4, v14 ; VI-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 4cc39d93854a0..761a568400c93 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -7628,8 +7628,8 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7LESS-NEXT: s_load_dword s10, s[4:5], 0xd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v4, s7, v0 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB13_4 @@ -7649,27 +7649,28 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX7LESS-NEXT: s_not_b32 s13, s12 ; GFX7LESS-NEXT: s_lshl_b32 s14, s6, s11 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s15 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s15 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s14, v1 -; GFX7LESS-NEXT: v_and_b32_e32 v0, s12, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_and_b32_e32 v2, s13, v1 -; GFX7LESS-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s14, v4 +; 
GFX7LESS-NEXT: v_and_b32_e32 v0, s12, v0 +; GFX7LESS-NEXT: v_and_b32_e32 v1, s13, v4 +; GFX7LESS-NEXT: v_or_b32_e32 v3, v1, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7LESS-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB13_2 ; GFX7LESS-NEXT: ; %bb.3: ; %atomicrmw.end ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX7LESS-NEXT: v_lshrrev_b32_e32 v0, s11, v2 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_lshrrev_b32_e32 v0, s11, v0 ; GFX7LESS-NEXT: .LBB13_4: ; %Flow ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -7679,7 +7680,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 ; GFX7LESS-NEXT: s_and_b32 s5, s10, 0xff ; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4 -; GFX7LESS-NEXT: v_mad_u32_u24 v0, s5, v4, v0 +; GFX7LESS-NEXT: v_mad_u32_u24 v0, s5, v2, v0 ; GFX7LESS-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; @@ -7689,8 +7690,8 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX8-NEXT: s_load_dword s10, s[4:5], 0x34 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, s7, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX8-NEXT: s_cbranch_execz .LBB13_4 @@ -7709,27 +7710,27 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX8-NEXT: s_lshl_b32 s14, s2, s11 ; GFX8-NEXT: s_mov_b64 
s[2:3], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s7 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s14, v1 -; GFX8-NEXT: v_and_b32_e32 v2, s13, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s14, v4 +; GFX8-NEXT: v_and_b32_e32 v1, s13, v4 ; GFX8-NEXT: v_and_b32_e32 v0, s12, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; GFX8-NEXT: v_or_b32_e32 v3, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_cbranch_execnz .LBB13_2 ; GFX8-NEXT: ; %bb.3: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s11, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s11, v0 ; GFX8-NEXT: .LBB13_4: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 @@ -7738,7 +7739,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_mad_u16 v0, s10, v4, v0 +; GFX8-NEXT: v_mad_u16 v0, s10, v2, v0 ; GFX8-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -7748,8 +7749,8 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX9-NEXT: s_load_dword s10, s[4:5], 0x34 ; GFX9-NEXT: s_mov_b64 s[6:7], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; 
GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, s7, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX9-NEXT: s_cbranch_execz .LBB13_4 @@ -7768,26 +7769,26 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX9-NEXT: s_lshl_b32 s14, s2, s11 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_add_u32_e32 v0, s14, v1 +; GFX9-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-NEXT: v_add_u32_e32 v0, s14, v4 ; GFX9-NEXT: v_and_b32_e32 v0, s12, v0 -; GFX9-NEXT: v_and_or_b32 v0, v1, s13, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; GFX9-NEXT: v_and_or_b32 v3, v4, s13, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB13_2 ; GFX9-NEXT: ; %bb.3: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: v_lshrrev_b32_e32 v0, s11, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, s11, v0 ; GFX9-NEXT: .LBB13_4: ; %Flow ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 @@ -7796,7 +7797,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: 
v_mad_legacy_u16 v0, s10, v4, v0 +; GFX9-NEXT: v_mad_legacy_u16 v0, s10, v2, v0 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -7807,9 +7808,9 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1064-NEXT: s_load_dword s10, s[4:5], 0x34 ; GFX1064-NEXT: s_mov_b64 s[6:7], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v4, s7, v0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB13_4 ; GFX1064-NEXT: ; %bb.1: @@ -7828,32 +7829,32 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s6, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s3 +; GFX1064-NEXT: v_mov_b32_e32 v0, s3 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_add_nc_u32_e32 v0, s14, v1 +; GFX1064-NEXT: v_mov_b32_e32 v4, v0 +; GFX1064-NEXT: v_add_nc_u32_e32 v0, s14, v4 ; GFX1064-NEXT: v_and_b32_e32 v0, s12, v0 -; GFX1064-NEXT: v_and_or_b32 v0, v1, s13, v0 -; GFX1064-NEXT: v_mov_b32_e32 v3, v1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; GFX1064-NEXT: v_and_or_b32 v3, v4, s13, v0 +; GFX1064-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064-NEXT: v_mov_b32_e32 v1, v4 +; GFX1064-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX1064-NEXT: v_mov_b32_e32 v1, v2 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_cbranch_execnz .LBB13_2 ; 
GFX1064-NEXT: ; %bb.3: ; %atomicrmw.end ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s11, v2 +; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s11, v0 ; GFX1064-NEXT: .LBB13_4: ; %Flow ; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1064-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: v_mad_u16 v0, s10, v4, s2 +; GFX1064-NEXT: v_mad_u16 v0, s10, v2, s2 ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm @@ -7865,9 +7866,9 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1032-NEXT: s_load_dword s8, s[4:5], 0x34 ; GFX1032-NEXT: s_mov_b32 s6, exec_lo ; GFX1032-NEXT: s_mov_b32 s10, 0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v4, s6, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-NEXT: s_and_saveexec_b32 s9, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB13_4 ; GFX1032-NEXT: ; %bb.1: @@ -7885,32 +7886,32 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1032-NEXT: s_lshl_b32 s12, s6, s2 ; GFX1032-NEXT: s_mov_b32 s6, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s7 +; GFX1032-NEXT: v_mov_b32_e32 v0, s7 ; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1032-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_add_nc_u32_e32 v0, s12, v1 +; GFX1032-NEXT: v_mov_b32_e32 v4, v0 +; GFX1032-NEXT: v_add_nc_u32_e32 v0, s12, v4 ; GFX1032-NEXT: v_and_b32_e32 v0, s3, v0 -; GFX1032-NEXT: v_and_or_b32 v0, v1, s11, v0 -; GFX1032-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; GFX1032-NEXT: v_and_or_b32 v3, 
v4, s11, v0 +; GFX1032-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032-NEXT: v_mov_b32_e32 v1, v4 +; GFX1032-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX1032-NEXT: v_mov_b32_e32 v1, v2 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX1032-NEXT: s_or_b32 s10, vcc_lo, s10 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s10 ; GFX1032-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1032-NEXT: ; %bb.3: ; %atomicrmw.end ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s10 -; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s2, v2 +; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s2, v0 ; GFX1032-NEXT: .LBB13_4: ; %Flow ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX1032-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: v_mad_u16 v0, s8, v4, s2 +; GFX1032-NEXT: v_mad_u16 v0, s8, v2, s2 ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm @@ -7924,9 +7925,9 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1164-TRUE16-NEXT: s_mov_b64 s[8:9], exec ; GFX1164-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-TRUE16-NEXT: v_mbcnt_hi_u32_b32 v4, s7, v0 +; GFX1164-TRUE16-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1164-TRUE16-NEXT: ; implicit-def: $vgpr0_lo16 -; GFX1164-TRUE16-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1164-TRUE16-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1164-TRUE16-NEXT: s_cbranch_execz .LBB13_4 ; GFX1164-TRUE16-NEXT: ; %bb.1: ; GFX1164-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -7944,29 +7945,29 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1164-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX1164-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX1164-TRUE16-NEXT: v_mov_b32_e32 v1, s3 +; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v0, s3 ; GFX1164-TRUE16-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-TRUE16-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1164-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-TRUE16-NEXT: v_add_nc_u32_e32 v0, s14, v1 +; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v4, v0 +; GFX1164-TRUE16-NEXT: v_add_nc_u32_e32 v0, s14, v4 +; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-TRUE16-NEXT: v_and_b32_e32 v0, s12, v0 -; GFX1164-TRUE16-NEXT: s_waitcnt_depctr 0xfff -; GFX1164-TRUE16-NEXT: v_and_or_b32 v0, v1, s13, v0 -; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc +; GFX1164-TRUE16-NEXT: v_and_or_b32 v3, v4, s13, v0 +; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v1, v4 +; GFX1164-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc ; GFX1164-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX1164-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1164-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX1164-TRUE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-TRUE16-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-TRUE16-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1164-TRUE16-NEXT: ; %bb.3: ; %atomicrmw.end ; GFX1164-TRUE16-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s11, v2 +; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s11, v0 ; GFX1164-TRUE16-NEXT: .LBB13_4: ; %Flow ; GFX1164-TRUE16-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -7975,7 +7976,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1164-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-TRUE16-NEXT: v_mad_u16 v0.l, s10, v4.l, s2 +; GFX1164-TRUE16-NEXT: v_mad_u16 v0.l, s10, v2.l, s2 ; GFX1164-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1164-TRUE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0 ; GFX1164-TRUE16-NEXT: s_endpgm @@ -7989,9 +7990,9 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1164-FAKE16-NEXT: s_mov_b64 s[8:9], exec ; GFX1164-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-FAKE16-NEXT: v_mbcnt_hi_u32_b32 v4, s7, v0 +; GFX1164-FAKE16-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1164-FAKE16-NEXT: ; implicit-def: $vgpr0 -; GFX1164-FAKE16-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1164-FAKE16-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1164-FAKE16-NEXT: s_cbranch_execz .LBB13_4 ; GFX1164-FAKE16-NEXT: ; %bb.1: ; GFX1164-FAKE16-NEXT: s_waitcnt lgkmcnt(0) @@ -8009,29 +8010,29 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1164-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX1164-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v1, s3 +; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v0, s3 ; GFX1164-FAKE16-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-FAKE16-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1164-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-FAKE16-NEXT: v_add_nc_u32_e32 v0, s14, v1 +; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v4, v0 +; GFX1164-FAKE16-NEXT: v_add_nc_u32_e32 v0, s14, v4 +; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-FAKE16-NEXT: v_and_b32_e32 v0, s12, v0 -; GFX1164-FAKE16-NEXT: s_waitcnt_depctr 0xfff -; GFX1164-FAKE16-NEXT: v_and_or_b32 v0, v1, s13, v0 -; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc +; GFX1164-FAKE16-NEXT: v_and_or_b32 v3, v4, s13, v0 +; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v1, v4 +; GFX1164-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc ; GFX1164-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX1164-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1164-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX1164-FAKE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-FAKE16-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-FAKE16-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1164-FAKE16-NEXT: ; %bb.3: ; %atomicrmw.end ; GFX1164-FAKE16-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s11, v2 +; GFX1164-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s11, v0 ; GFX1164-FAKE16-NEXT: .LBB13_4: ; %Flow ; GFX1164-FAKE16-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -8040,7 +8041,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1164-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-FAKE16-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-FAKE16-NEXT: v_mad_u16 v0, s10, v4, s2 +; GFX1164-FAKE16-NEXT: v_mad_u16 v0, s10, v2, s2 ; GFX1164-FAKE16-NEXT: s_mov_b32 s2, -1 ; GFX1164-FAKE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0 ; GFX1164-FAKE16-NEXT: s_endpgm @@ -8052,11 +8053,11 @@ 
define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1132-TRUE16-NEXT: s_load_b32 s8, s[4:5], 0x34 ; GFX1132-TRUE16-NEXT: s_mov_b32 s6, exec_lo ; GFX1132-TRUE16-NEXT: s_mov_b32 s10, 0 -; GFX1132-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v4, s6, 0 +; GFX1132-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0 ; GFX1132-TRUE16-NEXT: s_mov_b32 s9, exec_lo ; GFX1132-TRUE16-NEXT: ; implicit-def: $vgpr0_lo16 ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-TRUE16-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1132-TRUE16-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-TRUE16-NEXT: s_cbranch_execz .LBB13_4 ; GFX1132-TRUE16-NEXT: ; %bb.1: ; GFX1132-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -8073,27 +8074,27 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1132-TRUE16-NEXT: s_lshl_b32 s12, s6, s2 ; GFX1132-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX1132-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v1, s7 +; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v0, s7 ; GFX1132-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1132-TRUE16-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1132-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-TRUE16-NEXT: v_add_nc_u32_e32 v0, s12, v1 -; GFX1132-TRUE16-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v4, v0 +; GFX1132-TRUE16-NEXT: v_add_nc_u32_e32 v0, s12, v4 ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-TRUE16-NEXT: v_and_or_b32 v0, v1, s11, v0 -; GFX1132-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc +; GFX1132-TRUE16-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX1132-TRUE16-NEXT: v_and_or_b32 v3, v4, s11, v0 +; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX1132-TRUE16-NEXT: 
v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX1132-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc ; GFX1132-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX1132-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1132-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX1132-TRUE16-NEXT: s_or_b32 s10, vcc_lo, s10 -; GFX1132-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s10 ; GFX1132-TRUE16-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1132-TRUE16-NEXT: ; %bb.3: ; %atomicrmw.end ; GFX1132-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s10 -; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2 +; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v0 ; GFX1132-TRUE16-NEXT: .LBB13_4: ; %Flow ; GFX1132-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -8102,7 +8103,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1132-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-TRUE16-NEXT: v_mad_u16 v0.l, s8, v4.l, s2 +; GFX1132-TRUE16-NEXT: v_mad_u16 v0.l, s8, v2.l, s2 ; GFX1132-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1132-TRUE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0 ; GFX1132-TRUE16-NEXT: s_endpgm @@ -8114,11 +8115,11 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1132-FAKE16-NEXT: s_load_b32 s8, s[4:5], 0x34 ; GFX1132-FAKE16-NEXT: s_mov_b32 s6, exec_lo ; GFX1132-FAKE16-NEXT: s_mov_b32 s10, 0 -; GFX1132-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v4, s6, 0 +; GFX1132-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0 ; GFX1132-FAKE16-NEXT: s_mov_b32 s9, exec_lo ; GFX1132-FAKE16-NEXT: ; implicit-def: $vgpr0 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-FAKE16-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1132-FAKE16-NEXT: 
v_cmpx_eq_u32_e32 0, v2 ; GFX1132-FAKE16-NEXT: s_cbranch_execz .LBB13_4 ; GFX1132-FAKE16-NEXT: ; %bb.1: ; GFX1132-FAKE16-NEXT: s_waitcnt lgkmcnt(0) @@ -8135,27 +8136,27 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1132-FAKE16-NEXT: s_lshl_b32 s12, s6, s2 ; GFX1132-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX1132-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v1, s7 +; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v0, s7 ; GFX1132-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1132-FAKE16-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1132-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-FAKE16-NEXT: v_add_nc_u32_e32 v0, s12, v1 -; GFX1132-FAKE16-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v4, v0 +; GFX1132-FAKE16-NEXT: v_add_nc_u32_e32 v0, s12, v4 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-FAKE16-NEXT: v_and_or_b32 v0, v1, s11, v0 -; GFX1132-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc +; GFX1132-FAKE16-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX1132-FAKE16-NEXT: v_and_or_b32 v3, v4, s11, v0 +; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX1132-FAKE16-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX1132-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc ; GFX1132-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX1132-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1132-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX1132-FAKE16-NEXT: s_or_b32 s10, vcc_lo, s10 -; GFX1132-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s10 ; GFX1132-FAKE16-NEXT: s_cbranch_execnz .LBB13_2 ; 
GFX1132-FAKE16-NEXT: ; %bb.3: ; %atomicrmw.end ; GFX1132-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s10 -; GFX1132-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2 +; GFX1132-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v0 ; GFX1132-FAKE16-NEXT: .LBB13_4: ; %Flow ; GFX1132-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -8164,7 +8165,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1132-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-FAKE16-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-FAKE16-NEXT: v_mad_u16 v0, s8, v4, s2 +; GFX1132-FAKE16-NEXT: v_mad_u16 v0, s8, v2, s2 ; GFX1132-FAKE16-NEXT: s_mov_b32 s2, -1 ; GFX1132-FAKE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0 ; GFX1132-FAKE16-NEXT: s_endpgm @@ -8178,9 +8179,9 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1264-TRUE16-NEXT: s_mov_b64 s[8:9], exec ; GFX1264-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264-TRUE16-NEXT: v_mbcnt_hi_u32_b32 v4, s7, v0 +; GFX1264-TRUE16-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1264-TRUE16-NEXT: ; implicit-def: $vgpr0_lo16 -; GFX1264-TRUE16-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1264-TRUE16-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1264-TRUE16-NEXT: s_cbranch_execz .LBB13_4 ; GFX1264-TRUE16-NEXT: ; %bb.1: ; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0 @@ -8199,28 +8200,29 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1264-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1264-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v1, s3 +; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v0, s3 ; GFX1264-TRUE16-NEXT: s_mov_b64 s[2:3], 0 ; GFX1264-TRUE16-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1264-TRUE16-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264-TRUE16-NEXT: v_add_nc_u32_e32 v0, s14, v1 +; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v4, v0 +; GFX1264-TRUE16-NEXT: v_add_nc_u32_e32 v0, s14, v4 +; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-TRUE16-NEXT: v_and_b32_e32 v0, s12, v0 -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1264-TRUE16-NEXT: v_and_or_b32 v0, v1, s13, v0 -; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v3, v1 -; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v2, v0 -; GFX1264-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1264-TRUE16-NEXT: v_and_or_b32 v3, v4, s13, v0 +; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v1, v4 +; GFX1264-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1264-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1264-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1264-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX1264-TRUE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1264-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX1264-TRUE16-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1264-TRUE16-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1264-TRUE16-NEXT: ; %bb.3: ; %atomicrmw.end ; GFX1264-TRUE16-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s11, v2 +; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s11, v0 ; GFX1264-TRUE16-NEXT: .LBB13_4: ; %Flow ; GFX1264-TRUE16-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -8230,7 +8232,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1264-TRUE16-NEXT: 
v_readfirstlane_b32 s2, v0 ; GFX1264-TRUE16-NEXT: s_wait_alu 0xf1ff ; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1264-TRUE16-NEXT: v_mad_u16 v0.l, s10, v4.l, s2 +; GFX1264-TRUE16-NEXT: v_mad_u16 v0.l, s10, v2.l, s2 ; GFX1264-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1264-TRUE16-NEXT: buffer_store_b8 v0, off, s[0:3], null ; GFX1264-TRUE16-NEXT: s_endpgm @@ -8244,9 +8246,9 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1264-FAKE16-NEXT: s_mov_b64 s[8:9], exec ; GFX1264-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264-FAKE16-NEXT: v_mbcnt_hi_u32_b32 v4, s7, v0 +; GFX1264-FAKE16-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1264-FAKE16-NEXT: ; implicit-def: $vgpr0 -; GFX1264-FAKE16-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1264-FAKE16-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1264-FAKE16-NEXT: s_cbranch_execz .LBB13_4 ; GFX1264-FAKE16-NEXT: ; %bb.1: ; GFX1264-FAKE16-NEXT: s_wait_kmcnt 0x0 @@ -8265,28 +8267,29 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1264-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1264-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX1264-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v1, s3 +; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v0, s3 ; GFX1264-FAKE16-NEXT: s_mov_b64 s[2:3], 0 ; GFX1264-FAKE16-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1264-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264-FAKE16-NEXT: v_add_nc_u32_e32 v0, s14, v1 +; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v4, v0 +; GFX1264-FAKE16-NEXT: v_add_nc_u32_e32 v0, s14, v4 +; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-FAKE16-NEXT: v_and_b32_e32 v0, s12, v0 -; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; 
GFX1264-FAKE16-NEXT: v_and_or_b32 v0, v1, s13, v0 -; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v3, v1 -; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v2, v0 -; GFX1264-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1264-FAKE16-NEXT: v_and_or_b32 v3, v4, s13, v0 +; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v1, v4 +; GFX1264-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1264-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX1264-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1264-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX1264-FAKE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1264-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX1264-FAKE16-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1264-FAKE16-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1264-FAKE16-NEXT: ; %bb.3: ; %atomicrmw.end ; GFX1264-FAKE16-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1264-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s11, v2 +; GFX1264-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s11, v0 ; GFX1264-FAKE16-NEXT: .LBB13_4: ; %Flow ; GFX1264-FAKE16-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -8296,7 +8299,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1264-FAKE16-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1264-FAKE16-NEXT: s_wait_alu 0xf1ff ; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1264-FAKE16-NEXT: v_mad_u16 v0, s10, v4, s2 +; GFX1264-FAKE16-NEXT: v_mad_u16 v0, s10, v2, s2 ; GFX1264-FAKE16-NEXT: s_mov_b32 s2, -1 ; GFX1264-FAKE16-NEXT: buffer_store_b8 v0, off, s[0:3], null ; GFX1264-FAKE16-NEXT: s_endpgm @@ -8308,11 +8311,11 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1232-TRUE16-NEXT: s_load_b32 s8, s[4:5], 0x34 ; 
GFX1232-TRUE16-NEXT: s_mov_b32 s6, exec_lo ; GFX1232-TRUE16-NEXT: s_mov_b32 s10, 0 -; GFX1232-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v4, s6, 0 +; GFX1232-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0 ; GFX1232-TRUE16-NEXT: s_mov_b32 s9, exec_lo ; GFX1232-TRUE16-NEXT: ; implicit-def: $vgpr0_lo16 ; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-TRUE16-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1232-TRUE16-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1232-TRUE16-NEXT: s_cbranch_execz .LBB13_4 ; GFX1232-TRUE16-NEXT: ; %bb.1: ; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0 @@ -8332,27 +8335,28 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1232-TRUE16-NEXT: s_lshl_b32 s12, s6, s2 ; GFX1232-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v1, s7 +; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v0, s7 ; GFX1232-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1232-TRUE16-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1232-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232-TRUE16-NEXT: v_add_nc_u32_e32 v0, s12, v1 -; GFX1232-TRUE16-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v4, v0 +; GFX1232-TRUE16-NEXT: v_add_nc_u32_e32 v0, s12, v4 ; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232-TRUE16-NEXT: v_and_or_b32 v0, v1, s11, v0 -; GFX1232-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1232-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1232-TRUE16-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX1232-TRUE16-NEXT: v_and_or_b32 v3, v4, s11, v0 +; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1232-TRUE16-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX1232-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN 
scope:SCOPE_SYS ; GFX1232-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1232-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1232-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX1232-TRUE16-NEXT: s_or_b32 s10, vcc_lo, s10 ; GFX1232-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX1232-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s10 ; GFX1232-TRUE16-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1232-TRUE16-NEXT: ; %bb.3: ; %atomicrmw.end ; GFX1232-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s10 -; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2 +; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v0 ; GFX1232-TRUE16-NEXT: .LBB13_4: ; %Flow ; GFX1232-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -8362,7 +8366,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1232-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1232-TRUE16-NEXT: s_wait_alu 0xf1ff ; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-TRUE16-NEXT: v_mad_u16 v0.l, s8, v4.l, s2 +; GFX1232-TRUE16-NEXT: v_mad_u16 v0.l, s8, v2.l, s2 ; GFX1232-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1232-TRUE16-NEXT: buffer_store_b8 v0, off, s[0:3], null ; GFX1232-TRUE16-NEXT: s_endpgm @@ -8374,11 +8378,11 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1232-FAKE16-NEXT: s_load_b32 s8, s[4:5], 0x34 ; GFX1232-FAKE16-NEXT: s_mov_b32 s6, exec_lo ; GFX1232-FAKE16-NEXT: s_mov_b32 s10, 0 -; GFX1232-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v4, s6, 0 +; GFX1232-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0 ; GFX1232-FAKE16-NEXT: s_mov_b32 s9, exec_lo ; GFX1232-FAKE16-NEXT: ; implicit-def: $vgpr0 ; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-FAKE16-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1232-FAKE16-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1232-FAKE16-NEXT: s_cbranch_execz .LBB13_4 ; GFX1232-FAKE16-NEXT: ; %bb.1: ; GFX1232-FAKE16-NEXT: 
s_wait_kmcnt 0x0 @@ -8398,27 +8402,28 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1232-FAKE16-NEXT: s_lshl_b32 s12, s6, s2 ; GFX1232-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX1232-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v1, s7 +; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v0, s7 ; GFX1232-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1232-FAKE16-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1232-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232-FAKE16-NEXT: v_add_nc_u32_e32 v0, s12, v1 -; GFX1232-FAKE16-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v4, v0 +; GFX1232-FAKE16-NEXT: v_add_nc_u32_e32 v0, s12, v4 ; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232-FAKE16-NEXT: v_and_or_b32 v0, v1, s11, v0 -; GFX1232-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1232-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1232-FAKE16-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX1232-FAKE16-NEXT: v_and_or_b32 v3, v4, s11, v0 +; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1232-FAKE16-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX1232-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1232-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX1232-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1232-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX1232-FAKE16-NEXT: s_or_b32 s10, vcc_lo, s10 ; GFX1232-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX1232-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s10 ; GFX1232-FAKE16-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1232-FAKE16-NEXT: ; %bb.3: ; %atomicrmw.end ; GFX1232-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s10 -; GFX1232-FAKE16-NEXT: 
v_lshrrev_b32_e32 v0, s2, v2 +; GFX1232-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v0 ; GFX1232-FAKE16-NEXT: .LBB13_4: ; %Flow ; GFX1232-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -8428,7 +8433,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1232-FAKE16-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1232-FAKE16-NEXT: s_wait_alu 0xf1ff ; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-FAKE16-NEXT: v_mad_u16 v0, s8, v4, s2 +; GFX1232-FAKE16-NEXT: v_mad_u16 v0, s8, v2, s2 ; GFX1232-FAKE16-NEXT: s_mov_b32 s2, -1 ; GFX1232-FAKE16-NEXT: buffer_store_b8 v0, off, s[0:3], null ; GFX1232-FAKE16-NEXT: s_endpgm @@ -8455,27 +8460,28 @@ define amdgpu_kernel void @uniform_xchg_i8(ptr addrspace(1) %result, ptr addrspa ; GFX7LESS-NEXT: s_lshl_b32 s2, s3, s10 ; GFX7LESS-NEXT: s_not_b32 s3, s11 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_and_b32_e32 v0, s3, v1 -; GFX7LESS-NEXT: v_or_b32_e32 v0, s2, v0 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_and_b32_e32 v0, s3, v2 +; GFX7LESS-NEXT: v_or_b32_e32 v1, s2, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX7LESS-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7LESS-NEXT: 
s_cbranch_execnz .LBB14_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: v_lshrrev_b32_e32 v0, s10, v2 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_lshrrev_b32_e32 v0, s10, v0 ; GFX7LESS-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; @@ -8495,27 +8501,27 @@ define amdgpu_kernel void @uniform_xchg_i8(ptr addrspace(1) %result, ptr addrspa ; GFX8-NEXT: s_lshl_b32 s10, s2, s8 ; GFX8-NEXT: s_mov_b64 s[2:3], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s7 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_and_b32_e32 v0, s9, v1 -; GFX8-NEXT: v_or_b32_e32 v0, s10, v0 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; GFX8-NEXT: v_and_b32_e32 v0, s9, v2 +; GFX8-NEXT: v_or_b32_e32 v1, s10, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, v2 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_cbranch_execnz .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s8, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s8, v0 ; GFX8-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -8535,27 +8541,27 @@ define amdgpu_kernel void @uniform_xchg_i8(ptr addrspace(1) %result, ptr addrspa ; GFX9-NEXT: s_lshl_b32 s10, s2, s8 ; GFX9-NEXT: 
s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_and_b32_e32 v0, s9, v1 -; GFX9-NEXT: v_or_b32_e32 v0, s10, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; GFX9-NEXT: v_and_b32_e32 v0, s9, v2 +; GFX9-NEXT: v_or_b32_e32 v1, s10, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB14_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, s8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, s8, v0 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -8577,23 +8583,23 @@ define amdgpu_kernel void @uniform_xchg_i8(ptr addrspace(1) %result, ptr addrspa ; GFX1064-NEXT: s_not_b32 s9, s2 ; GFX1064-NEXT: s_mov_b32 s6, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s3 +; GFX1064-NEXT: v_mov_b32_e32 v0, s3 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_and_or_b32 v0, v1, s9, s10 -; GFX1064-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX1064-NEXT: v_and_or_b32 
v1, v2, s9, s10 +; GFX1064-NEXT: v_mov_b32_e32 v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v2 +; GFX1064-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_cbranch_execnz .LBB14_1 ; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s8, v2 +; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s8, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: buffer_store_byte v0, off, s[0:3], 0 @@ -8617,23 +8623,23 @@ define amdgpu_kernel void @uniform_xchg_i8(ptr addrspace(1) %result, ptr addrspa ; GFX1032-NEXT: s_not_b32 s3, s3 ; GFX1032-NEXT: s_mov_b32 s6, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s7 +; GFX1032-NEXT: v_mov_b32_e32 v0, s7 ; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1032-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_and_or_b32 v0, v1, s3, s8 -; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX1032-NEXT: v_and_or_b32 v1, v2, s3, s8 +; GFX1032-NEXT: v_mov_b32_e32 v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v2 +; GFX1032-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2 ; GFX1032-NEXT: s_or_b32 s9, vcc_lo, s9 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 ; GFX1032-NEXT: s_cbranch_execnz .LBB14_1 ; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s9 -; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s2, v2 +; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; 
GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: buffer_store_byte v0, off, s[0:3], 0 @@ -8657,25 +8663,26 @@ define amdgpu_kernel void @uniform_xchg_i8(ptr addrspace(1) %result, ptr addrspa ; GFX1164-NEXT: s_not_b32 s9, s2 ; GFX1164-NEXT: s_mov_b32 s6, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v1, s3 +; GFX1164-NEXT: v_mov_b32_e32 v0, s3 ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_and_or_b32 v0, v1, s9, s10 -; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX1164-NEXT: v_and_or_b32 v1, v2, s9, s10 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v2 +; GFX1164-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_cbranch_execnz .LBB14_1 ; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-NEXT: v_lshrrev_b32_e32 v0, s8, v2 +; GFX1164-NEXT: v_lshrrev_b32_e32 v0, s8, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: buffer_store_b8 v0, off, s[0:3], 0 @@ -8699,24 +8706,26 @@ define amdgpu_kernel void @uniform_xchg_i8(ptr addrspace(1) %result, ptr addrspa ; GFX1132-NEXT: s_not_b32 s3, s3 ; GFX1132-NEXT: s_mov_b32 s6, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: 
v_mov_b32_e32 v1, s7 +; GFX1132-NEXT: v_mov_b32_e32 v0, s7 ; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1132-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_and_or_b32 v0, v1, s3, s8 -; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX1132-NEXT: v_mov_b32_e32 v2, v0 +; GFX1132-NEXT: v_and_or_b32 v1, v2, s3, s8 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v2 +; GFX1132-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2 ; GFX1132-NEXT: s_or_b32 s9, vcc_lo, s9 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s9 ; GFX1132-NEXT: s_cbranch_execnz .LBB14_1 ; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s9 -; GFX1132-NEXT: v_lshrrev_b32_e32 v0, s2, v2 +; GFX1132-NEXT: v_lshrrev_b32_e32 v0, s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: buffer_store_b8 v0, off, s[0:3], 0 @@ -8740,25 +8749,26 @@ define amdgpu_kernel void @uniform_xchg_i8(ptr addrspace(1) %result, ptr addrspa ; GFX1264-NEXT: s_not_b32 s9, s2 ; GFX1264-NEXT: s_mov_b32 s6, -1 ; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: v_mov_b32_e32 v1, s3 +; GFX1264-NEXT: v_mov_b32_e32 v0, s3 ; GFX1264-NEXT: s_mov_b64 s[2:3], 0 ; GFX1264-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX1264-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1264-NEXT: v_and_or_b32 v0, v1, s9, s10 -; GFX1264-NEXT: v_mov_b32_e32 v3, v1 +; 
GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mov_b32_e32 v2, v0 -; GFX1264-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1264-NEXT: s_wait_loadcnt 0x0 -; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX1264-NEXT: v_and_or_b32 v1, v2, s9, s10 +; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1264-NEXT: v_mov_b32_e32 v0, v1 ; GFX1264-NEXT: v_mov_b32_e32 v1, v2 +; GFX1264-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1264-NEXT: s_wait_loadcnt 0x0 +; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX1264-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1264-NEXT: s_cbranch_execnz .LBB14_1 ; GFX1264-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1264-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1264-NEXT: v_lshrrev_b32_e32 v0, s8, v2 +; GFX1264-NEXT: v_lshrrev_b32_e32 v0, s8, v0 ; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1264-NEXT: s_mov_b32 s2, -1 ; GFX1264-NEXT: buffer_store_b8 v0, off, s[0:3], null @@ -8782,24 +8792,26 @@ define amdgpu_kernel void @uniform_xchg_i8(ptr addrspace(1) %result, ptr addrspa ; GFX1232-NEXT: s_not_b32 s3, s3 ; GFX1232-NEXT: s_mov_b32 s6, -1 ; GFX1232-NEXT: s_wait_kmcnt 0x0 -; GFX1232-NEXT: v_mov_b32_e32 v1, s7 +; GFX1232-NEXT: v_mov_b32_e32 v0, s7 ; GFX1232-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1232-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX1232-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232-NEXT: v_and_or_b32 v0, v1, s3, s8 -; GFX1232-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1232-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1232-NEXT: s_wait_loadcnt 0x0 -; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX1232-NEXT: v_mov_b32_e32 v2, v0 +; 
GFX1232-NEXT: v_and_or_b32 v1, v2, s3, s8 +; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1232-NEXT: v_mov_b32_e32 v0, v1 ; GFX1232-NEXT: v_mov_b32_e32 v1, v2 +; GFX1232-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1232-NEXT: s_wait_loadcnt 0x0 +; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2 ; GFX1232-NEXT: s_or_b32 s9, vcc_lo, s9 ; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_and_not1_b32 exec_lo, exec_lo, s9 ; GFX1232-NEXT: s_cbranch_execnz .LBB14_1 ; GFX1232-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s9 -; GFX1232-NEXT: v_lshrrev_b32_e32 v0, s2, v2 +; GFX1232-NEXT: v_lshrrev_b32_e32 v0, s2, v0 ; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1232-NEXT: s_mov_b32 s2, -1 ; GFX1232-NEXT: buffer_store_b8 v0, off, s[0:3], null @@ -9321,8 +9333,8 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7LESS-NEXT: s_load_dword s10, s[4:5], 0xd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v4, s7, v0 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB16_4 @@ -9342,27 +9354,28 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX7LESS-NEXT: s_not_b32 s13, s12 ; GFX7LESS-NEXT: s_lshl_b32 s14, s6, s11 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s15 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s15 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s14, v1 -; GFX7LESS-NEXT: v_and_b32_e32 v0, s12, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0 ; GFX7LESS-NEXT: 
s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_and_b32_e32 v2, s13, v1 -; GFX7LESS-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s14, v4 +; GFX7LESS-NEXT: v_and_b32_e32 v0, s12, v0 +; GFX7LESS-NEXT: v_and_b32_e32 v1, s13, v4 +; GFX7LESS-NEXT: v_or_b32_e32 v3, v1, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7LESS-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB16_2 ; GFX7LESS-NEXT: ; %bb.3: ; %atomicrmw.end ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX7LESS-NEXT: v_bfe_u32 v0, v2, s11, 16 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_bfe_u32 v0, v0, s11, 16 ; GFX7LESS-NEXT: .LBB16_4: ; %Flow ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -9372,7 +9385,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 ; GFX7LESS-NEXT: s_and_b32 s5, s10, 0xffff ; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4 -; GFX7LESS-NEXT: v_mad_u32_u24 v0, s5, v4, v0 +; GFX7LESS-NEXT: v_mad_u32_u24 v0, s5, v2, v0 ; GFX7LESS-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; @@ -9382,8 +9395,8 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX8-NEXT: s_load_dword s10, s[4:5], 0x34 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, s7, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX8-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX8-NEXT: s_cbranch_execz .LBB16_4 @@ -9402,27 +9415,27 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX8-NEXT: s_lshl_b32 s14, s2, s11 ; GFX8-NEXT: s_mov_b64 s[2:3], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s7 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s14, v1 -; GFX8-NEXT: v_and_b32_e32 v2, s13, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s14, v4 +; GFX8-NEXT: v_and_b32_e32 v1, s13, v4 ; GFX8-NEXT: v_and_b32_e32 v0, s12, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; GFX8-NEXT: v_or_b32_e32 v3, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_cbranch_execnz .LBB16_2 ; GFX8-NEXT: ; %bb.3: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s11, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s11, v0 ; GFX8-NEXT: .LBB16_4: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -9431,7 +9444,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_mad_u16 v0, s10, v4, v0 +; GFX8-NEXT: v_mad_u16 v0, s10, 
v2, v0 ; GFX8-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -9441,8 +9454,8 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX9-NEXT: s_load_dword s10, s[4:5], 0x34 ; GFX9-NEXT: s_mov_b64 s[6:7], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, s7, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX9-NEXT: s_cbranch_execz .LBB16_4 @@ -9461,26 +9474,26 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX9-NEXT: s_lshl_b32 s14, s2, s11 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_add_u32_e32 v0, s14, v1 +; GFX9-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-NEXT: v_add_u32_e32 v0, s14, v4 ; GFX9-NEXT: v_and_b32_e32 v0, s12, v0 -; GFX9-NEXT: v_and_or_b32 v0, v1, s13, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; GFX9-NEXT: v_and_or_b32 v3, v4, s13, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB16_2 ; GFX9-NEXT: ; %bb.3: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: v_lshrrev_b32_e32 v0, s11, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, s11, v0 ; GFX9-NEXT: 
.LBB16_4: ; %Flow ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -9489,7 +9502,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_mad_legacy_u16 v0, s10, v4, v0 +; GFX9-NEXT: v_mad_legacy_u16 v0, s10, v2, v0 ; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -9500,9 +9513,9 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1064-NEXT: s_load_dword s10, s[4:5], 0x34 ; GFX1064-NEXT: s_mov_b64 s[6:7], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v4, s7, v0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB16_4 ; GFX1064-NEXT: ; %bb.1: @@ -9521,32 +9534,32 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s6, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s3 +; GFX1064-NEXT: v_mov_b32_e32 v0, s3 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_add_nc_u32_e32 v0, s14, v1 +; GFX1064-NEXT: v_mov_b32_e32 v4, v0 +; GFX1064-NEXT: v_add_nc_u32_e32 v0, s14, v4 ; GFX1064-NEXT: v_and_b32_e32 v0, s12, v0 -; GFX1064-NEXT: v_and_or_b32 v0, v1, s13, v0 -; GFX1064-NEXT: v_mov_b32_e32 v3, v1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; GFX1064-NEXT: v_and_or_b32 v3, v4, s13, v0 +; GFX1064-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064-NEXT: v_mov_b32_e32 v1, v4 +; GFX1064-NEXT: buffer_atomic_cmpswap v[0:1], off, 
s[4:7], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX1064-NEXT: v_mov_b32_e32 v1, v2 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1064-NEXT: ; %bb.3: ; %atomicrmw.end ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s11, v2 +; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s11, v0 ; GFX1064-NEXT: .LBB16_4: ; %Flow ; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1064-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: v_mad_u16 v0, s10, v4, s2 +; GFX1064-NEXT: v_mad_u16 v0, s10, v2, s2 ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm @@ -9558,9 +9571,9 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1032-NEXT: s_load_dword s8, s[4:5], 0x34 ; GFX1032-NEXT: s_mov_b32 s6, exec_lo ; GFX1032-NEXT: s_mov_b32 s10, 0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v4, s6, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-NEXT: s_and_saveexec_b32 s9, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB16_4 ; GFX1032-NEXT: ; %bb.1: @@ -9578,32 +9591,32 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1032-NEXT: s_lshl_b32 s12, s6, s2 ; GFX1032-NEXT: s_mov_b32 s6, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s7 +; GFX1032-NEXT: v_mov_b32_e32 v0, s7 ; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1032-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_add_nc_u32_e32 v0, s12, v1 +; GFX1032-NEXT: v_mov_b32_e32 
v4, v0 +; GFX1032-NEXT: v_add_nc_u32_e32 v0, s12, v4 ; GFX1032-NEXT: v_and_b32_e32 v0, s3, v0 -; GFX1032-NEXT: v_and_or_b32 v0, v1, s11, v0 -; GFX1032-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; GFX1032-NEXT: v_and_or_b32 v3, v4, s11, v0 +; GFX1032-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032-NEXT: v_mov_b32_e32 v1, v4 +; GFX1032-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX1032-NEXT: v_mov_b32_e32 v1, v2 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX1032-NEXT: s_or_b32 s10, vcc_lo, s10 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s10 ; GFX1032-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1032-NEXT: ; %bb.3: ; %atomicrmw.end ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s10 -; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s2, v2 +; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s2, v0 ; GFX1032-NEXT: .LBB16_4: ; %Flow ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX1032-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: v_mad_u16 v0, s8, v4, s2 +; GFX1032-NEXT: v_mad_u16 v0, s8, v2, s2 ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm @@ -9617,9 +9630,9 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1164-TRUE16-NEXT: s_mov_b64 s[8:9], exec ; GFX1164-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-TRUE16-NEXT: v_mbcnt_hi_u32_b32 v4, s7, v0 +; GFX1164-TRUE16-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1164-TRUE16-NEXT: ; implicit-def: $vgpr0_lo16 -; GFX1164-TRUE16-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1164-TRUE16-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1164-TRUE16-NEXT: s_cbranch_execz .LBB16_4 ; 
GFX1164-TRUE16-NEXT: ; %bb.1: ; GFX1164-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -9637,29 +9650,29 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1164-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX1164-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v1, s3 +; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v0, s3 ; GFX1164-TRUE16-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-TRUE16-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1164-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-TRUE16-NEXT: v_add_nc_u32_e32 v0, s14, v1 +; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v4, v0 +; GFX1164-TRUE16-NEXT: v_add_nc_u32_e32 v0, s14, v4 +; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-TRUE16-NEXT: v_and_b32_e32 v0, s12, v0 -; GFX1164-TRUE16-NEXT: s_waitcnt_depctr 0xfff -; GFX1164-TRUE16-NEXT: v_and_or_b32 v0, v1, s13, v0 -; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc +; GFX1164-TRUE16-NEXT: v_and_or_b32 v3, v4, s13, v0 +; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v1, v4 +; GFX1164-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc ; GFX1164-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX1164-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1164-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX1164-TRUE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-TRUE16-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-TRUE16-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1164-TRUE16-NEXT: ; %bb.3: ; %atomicrmw.end 
; GFX1164-TRUE16-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s11, v2 +; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s11, v0 ; GFX1164-TRUE16-NEXT: .LBB16_4: ; %Flow ; GFX1164-TRUE16-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -9668,7 +9681,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1164-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-TRUE16-NEXT: v_mad_u16 v0.l, s10, v4.l, s2 +; GFX1164-TRUE16-NEXT: v_mad_u16 v0.l, s10, v2.l, s2 ; GFX1164-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1164-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; GFX1164-TRUE16-NEXT: s_endpgm @@ -9682,9 +9695,9 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1164-FAKE16-NEXT: s_mov_b64 s[8:9], exec ; GFX1164-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-FAKE16-NEXT: v_mbcnt_hi_u32_b32 v4, s7, v0 +; GFX1164-FAKE16-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1164-FAKE16-NEXT: ; implicit-def: $vgpr0 -; GFX1164-FAKE16-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1164-FAKE16-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1164-FAKE16-NEXT: s_cbranch_execz .LBB16_4 ; GFX1164-FAKE16-NEXT: ; %bb.1: ; GFX1164-FAKE16-NEXT: s_waitcnt lgkmcnt(0) @@ -9702,29 +9715,29 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1164-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX1164-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v1, s3 +; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v0, s3 ; GFX1164-FAKE16-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-FAKE16-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1164-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-FAKE16-NEXT: v_add_nc_u32_e32 v0, s14, v1 +; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v4, v0 +; GFX1164-FAKE16-NEXT: v_add_nc_u32_e32 v0, s14, v4 +; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-FAKE16-NEXT: v_and_b32_e32 v0, s12, v0 -; GFX1164-FAKE16-NEXT: s_waitcnt_depctr 0xfff -; GFX1164-FAKE16-NEXT: v_and_or_b32 v0, v1, s13, v0 -; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc +; GFX1164-FAKE16-NEXT: v_and_or_b32 v3, v4, s13, v0 +; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v1, v4 +; GFX1164-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc ; GFX1164-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX1164-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1164-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX1164-FAKE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-FAKE16-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-FAKE16-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1164-FAKE16-NEXT: ; %bb.3: ; %atomicrmw.end ; GFX1164-FAKE16-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s11, v2 +; GFX1164-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s11, v0 ; GFX1164-FAKE16-NEXT: .LBB16_4: ; %Flow ; GFX1164-FAKE16-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -9733,7 +9746,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1164-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-FAKE16-NEXT: v_readfirstlane_b32 s2, v0 ; 
GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-FAKE16-NEXT: v_mad_u16 v0, s10, v4, s2 +; GFX1164-FAKE16-NEXT: v_mad_u16 v0, s10, v2, s2 ; GFX1164-FAKE16-NEXT: s_mov_b32 s2, -1 ; GFX1164-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; GFX1164-FAKE16-NEXT: s_endpgm @@ -9745,11 +9758,11 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1132-TRUE16-NEXT: s_load_b32 s8, s[4:5], 0x34 ; GFX1132-TRUE16-NEXT: s_mov_b32 s6, exec_lo ; GFX1132-TRUE16-NEXT: s_mov_b32 s10, 0 -; GFX1132-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v4, s6, 0 +; GFX1132-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0 ; GFX1132-TRUE16-NEXT: s_mov_b32 s9, exec_lo ; GFX1132-TRUE16-NEXT: ; implicit-def: $vgpr0_lo16 ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-TRUE16-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1132-TRUE16-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-TRUE16-NEXT: s_cbranch_execz .LBB16_4 ; GFX1132-TRUE16-NEXT: ; %bb.1: ; GFX1132-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -9766,27 +9779,27 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1132-TRUE16-NEXT: s_lshl_b32 s12, s6, s2 ; GFX1132-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX1132-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v1, s7 +; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v0, s7 ; GFX1132-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1132-TRUE16-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1132-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-TRUE16-NEXT: v_add_nc_u32_e32 v0, s12, v1 -; GFX1132-TRUE16-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v4, v0 +; GFX1132-TRUE16-NEXT: v_add_nc_u32_e32 v0, s12, v4 ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-TRUE16-NEXT: v_and_or_b32 v0, v1, s11, v0 -; GFX1132-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 
-; GFX1132-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc +; GFX1132-TRUE16-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX1132-TRUE16-NEXT: v_and_or_b32 v3, v4, s11, v0 +; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX1132-TRUE16-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX1132-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc ; GFX1132-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX1132-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1132-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX1132-TRUE16-NEXT: s_or_b32 s10, vcc_lo, s10 -; GFX1132-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s10 ; GFX1132-TRUE16-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1132-TRUE16-NEXT: ; %bb.3: ; %atomicrmw.end ; GFX1132-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s10 -; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2 +; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v0 ; GFX1132-TRUE16-NEXT: .LBB16_4: ; %Flow ; GFX1132-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -9795,7 +9808,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1132-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-TRUE16-NEXT: v_mad_u16 v0.l, s8, v4.l, s2 +; GFX1132-TRUE16-NEXT: v_mad_u16 v0.l, s8, v2.l, s2 ; GFX1132-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1132-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; GFX1132-TRUE16-NEXT: s_endpgm @@ -9807,11 +9820,11 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1132-FAKE16-NEXT: s_load_b32 s8, s[4:5], 0x34 ; GFX1132-FAKE16-NEXT: s_mov_b32 s6, exec_lo ; GFX1132-FAKE16-NEXT: s_mov_b32 s10, 0 -; GFX1132-FAKE16-NEXT: 
v_mbcnt_lo_u32_b32 v4, s6, 0 +; GFX1132-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0 ; GFX1132-FAKE16-NEXT: s_mov_b32 s9, exec_lo ; GFX1132-FAKE16-NEXT: ; implicit-def: $vgpr0 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-FAKE16-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1132-FAKE16-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-FAKE16-NEXT: s_cbranch_execz .LBB16_4 ; GFX1132-FAKE16-NEXT: ; %bb.1: ; GFX1132-FAKE16-NEXT: s_waitcnt lgkmcnt(0) @@ -9828,27 +9841,27 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1132-FAKE16-NEXT: s_lshl_b32 s12, s6, s2 ; GFX1132-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX1132-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v1, s7 +; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v0, s7 ; GFX1132-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1132-FAKE16-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1132-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-FAKE16-NEXT: v_add_nc_u32_e32 v0, s12, v1 -; GFX1132-FAKE16-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v4, v0 +; GFX1132-FAKE16-NEXT: v_add_nc_u32_e32 v0, s12, v4 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-FAKE16-NEXT: v_and_or_b32 v0, v1, s11, v0 -; GFX1132-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc +; GFX1132-FAKE16-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX1132-FAKE16-NEXT: v_and_or_b32 v3, v4, s11, v0 +; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX1132-FAKE16-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX1132-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc ; GFX1132-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX1132-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX1132-FAKE16-NEXT: 
v_mov_b32_e32 v1, v2 +; GFX1132-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX1132-FAKE16-NEXT: s_or_b32 s10, vcc_lo, s10 -; GFX1132-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s10 ; GFX1132-FAKE16-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1132-FAKE16-NEXT: ; %bb.3: ; %atomicrmw.end ; GFX1132-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s10 -; GFX1132-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2 +; GFX1132-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v0 ; GFX1132-FAKE16-NEXT: .LBB16_4: ; %Flow ; GFX1132-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -9857,7 +9870,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1132-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-FAKE16-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-FAKE16-NEXT: v_mad_u16 v0, s8, v4, s2 +; GFX1132-FAKE16-NEXT: v_mad_u16 v0, s8, v2, s2 ; GFX1132-FAKE16-NEXT: s_mov_b32 s2, -1 ; GFX1132-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; GFX1132-FAKE16-NEXT: s_endpgm @@ -9871,9 +9884,9 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1264-TRUE16-NEXT: s_mov_b64 s[8:9], exec ; GFX1264-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264-TRUE16-NEXT: v_mbcnt_hi_u32_b32 v4, s7, v0 +; GFX1264-TRUE16-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1264-TRUE16-NEXT: ; implicit-def: $vgpr0_lo16 -; GFX1264-TRUE16-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1264-TRUE16-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1264-TRUE16-NEXT: s_cbranch_execz .LBB16_4 ; GFX1264-TRUE16-NEXT: ; %bb.1: ; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0 @@ -9892,28 +9905,29 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1264-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 
; GFX1264-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v1, s3 +; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v0, s3 ; GFX1264-TRUE16-NEXT: s_mov_b64 s[2:3], 0 ; GFX1264-TRUE16-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1264-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264-TRUE16-NEXT: v_add_nc_u32_e32 v0, s14, v1 +; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v4, v0 +; GFX1264-TRUE16-NEXT: v_add_nc_u32_e32 v0, s14, v4 +; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-TRUE16-NEXT: v_and_b32_e32 v0, s12, v0 -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1264-TRUE16-NEXT: v_and_or_b32 v0, v1, s13, v0 -; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v3, v1 -; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v2, v0 -; GFX1264-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1264-TRUE16-NEXT: v_and_or_b32 v3, v4, s13, v0 +; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v1, v4 +; GFX1264-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1264-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1264-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1264-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX1264-TRUE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1264-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX1264-TRUE16-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1264-TRUE16-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1264-TRUE16-NEXT: ; %bb.3: ; %atomicrmw.end ; GFX1264-TRUE16-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s11, v2 +; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s11, v0 ; GFX1264-TRUE16-NEXT: 
.LBB16_4: ; %Flow ; GFX1264-TRUE16-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -9923,7 +9937,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1264-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1264-TRUE16-NEXT: s_wait_alu 0xf1ff ; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1264-TRUE16-NEXT: v_mad_u16 v0.l, s10, v4.l, s2 +; GFX1264-TRUE16-NEXT: v_mad_u16 v0.l, s10, v2.l, s2 ; GFX1264-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1264-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null ; GFX1264-TRUE16-NEXT: s_endpgm @@ -9937,9 +9951,9 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1264-FAKE16-NEXT: s_mov_b64 s[8:9], exec ; GFX1264-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264-FAKE16-NEXT: v_mbcnt_hi_u32_b32 v4, s7, v0 +; GFX1264-FAKE16-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1264-FAKE16-NEXT: ; implicit-def: $vgpr0 -; GFX1264-FAKE16-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1264-FAKE16-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1264-FAKE16-NEXT: s_cbranch_execz .LBB16_4 ; GFX1264-FAKE16-NEXT: ; %bb.1: ; GFX1264-FAKE16-NEXT: s_wait_kmcnt 0x0 @@ -9958,28 +9972,29 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1264-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1264-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX1264-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v1, s3 +; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v0, s3 ; GFX1264-FAKE16-NEXT: s_mov_b64 s[2:3], 0 ; GFX1264-FAKE16-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1264-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264-FAKE16-NEXT: v_add_nc_u32_e32 v0, s14, v1 +; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v4, v0 +; 
GFX1264-FAKE16-NEXT: v_add_nc_u32_e32 v0, s14, v4 +; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-FAKE16-NEXT: v_and_b32_e32 v0, s12, v0 -; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1264-FAKE16-NEXT: v_and_or_b32 v0, v1, s13, v0 -; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v3, v1 -; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v2, v0 -; GFX1264-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1264-FAKE16-NEXT: v_and_or_b32 v3, v4, s13, v0 +; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v1, v4 +; GFX1264-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1264-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX1264-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1264-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX1264-FAKE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1264-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX1264-FAKE16-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1264-FAKE16-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1264-FAKE16-NEXT: ; %bb.3: ; %atomicrmw.end ; GFX1264-FAKE16-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1264-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s11, v2 +; GFX1264-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s11, v0 ; GFX1264-FAKE16-NEXT: .LBB16_4: ; %Flow ; GFX1264-FAKE16-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -9989,7 +10004,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1264-FAKE16-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1264-FAKE16-NEXT: s_wait_alu 0xf1ff ; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1264-FAKE16-NEXT: v_mad_u16 v0, s10, v4, s2 +; GFX1264-FAKE16-NEXT: v_mad_u16 v0, s10, v2, s2 ; 
GFX1264-FAKE16-NEXT: s_mov_b32 s2, -1 ; GFX1264-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null ; GFX1264-FAKE16-NEXT: s_endpgm @@ -10001,11 +10016,11 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1232-TRUE16-NEXT: s_load_b32 s8, s[4:5], 0x34 ; GFX1232-TRUE16-NEXT: s_mov_b32 s6, exec_lo ; GFX1232-TRUE16-NEXT: s_mov_b32 s10, 0 -; GFX1232-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v4, s6, 0 +; GFX1232-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0 ; GFX1232-TRUE16-NEXT: s_mov_b32 s9, exec_lo ; GFX1232-TRUE16-NEXT: ; implicit-def: $vgpr0_lo16 ; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-TRUE16-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1232-TRUE16-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1232-TRUE16-NEXT: s_cbranch_execz .LBB16_4 ; GFX1232-TRUE16-NEXT: ; %bb.1: ; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0 @@ -10025,27 +10040,28 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1232-TRUE16-NEXT: s_lshl_b32 s12, s6, s2 ; GFX1232-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v1, s7 +; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v0, s7 ; GFX1232-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1232-TRUE16-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1232-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232-TRUE16-NEXT: v_add_nc_u32_e32 v0, s12, v1 -; GFX1232-TRUE16-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v4, v0 +; GFX1232-TRUE16-NEXT: v_add_nc_u32_e32 v0, s12, v4 ; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232-TRUE16-NEXT: v_and_or_b32 v0, v1, s11, v0 -; GFX1232-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1232-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1232-TRUE16-NEXT: v_and_b32_e32 v0, s3, 
v0 +; GFX1232-TRUE16-NEXT: v_and_or_b32 v3, v4, s11, v0 +; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1232-TRUE16-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX1232-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1232-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1232-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1232-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX1232-TRUE16-NEXT: s_or_b32 s10, vcc_lo, s10 ; GFX1232-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX1232-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s10 ; GFX1232-TRUE16-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1232-TRUE16-NEXT: ; %bb.3: ; %atomicrmw.end ; GFX1232-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s10 -; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2 +; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v0 ; GFX1232-TRUE16-NEXT: .LBB16_4: ; %Flow ; GFX1232-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -10055,7 +10071,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1232-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1232-TRUE16-NEXT: s_wait_alu 0xf1ff ; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-TRUE16-NEXT: v_mad_u16 v0.l, s8, v4.l, s2 +; GFX1232-TRUE16-NEXT: v_mad_u16 v0.l, s8, v2.l, s2 ; GFX1232-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1232-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null ; GFX1232-TRUE16-NEXT: s_endpgm @@ -10067,11 +10083,11 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1232-FAKE16-NEXT: s_load_b32 s8, s[4:5], 0x34 ; GFX1232-FAKE16-NEXT: s_mov_b32 s6, exec_lo ; GFX1232-FAKE16-NEXT: s_mov_b32 s10, 0 -; GFX1232-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v4, s6, 0 +; GFX1232-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0 ; GFX1232-FAKE16-NEXT: s_mov_b32 s9, exec_lo ; GFX1232-FAKE16-NEXT: 
; implicit-def: $vgpr0 ; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-FAKE16-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1232-FAKE16-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1232-FAKE16-NEXT: s_cbranch_execz .LBB16_4 ; GFX1232-FAKE16-NEXT: ; %bb.1: ; GFX1232-FAKE16-NEXT: s_wait_kmcnt 0x0 @@ -10091,27 +10107,28 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1232-FAKE16-NEXT: s_lshl_b32 s12, s6, s2 ; GFX1232-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX1232-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v1, s7 +; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v0, s7 ; GFX1232-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1232-FAKE16-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1232-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232-FAKE16-NEXT: v_add_nc_u32_e32 v0, s12, v1 -; GFX1232-FAKE16-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v4, v0 +; GFX1232-FAKE16-NEXT: v_add_nc_u32_e32 v0, s12, v4 ; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232-FAKE16-NEXT: v_and_or_b32 v0, v1, s11, v0 -; GFX1232-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1232-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1232-FAKE16-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX1232-FAKE16-NEXT: v_and_or_b32 v3, v4, s11, v0 +; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1232-FAKE16-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX1232-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1232-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX1232-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1232-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX1232-FAKE16-NEXT: s_or_b32 s10, vcc_lo, s10 ; 
GFX1232-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX1232-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s10 ; GFX1232-FAKE16-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1232-FAKE16-NEXT: ; %bb.3: ; %atomicrmw.end ; GFX1232-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s10 -; GFX1232-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2 +; GFX1232-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v0 ; GFX1232-FAKE16-NEXT: .LBB16_4: ; %Flow ; GFX1232-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -10121,7 +10138,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1232-FAKE16-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1232-FAKE16-NEXT: s_wait_alu 0xf1ff ; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-FAKE16-NEXT: v_mad_u16 v0, s8, v4, s2 +; GFX1232-FAKE16-NEXT: v_mad_u16 v0, s8, v2, s2 ; GFX1232-FAKE16-NEXT: s_mov_b32 s2, -1 ; GFX1232-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null ; GFX1232-FAKE16-NEXT: s_endpgm @@ -10148,27 +10165,28 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp ; GFX7LESS-NEXT: s_lshl_b32 s2, s3, s10 ; GFX7LESS-NEXT: s_not_b32 s3, s11 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_and_b32_e32 v0, s3, v1 -; GFX7LESS-NEXT: v_or_b32_e32 v0, s2, v0 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_and_b32_e32 v0, s3, v2 +; GFX7LESS-NEXT: v_or_b32_e32 v1, s2, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; 
GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX7LESS-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: v_lshrrev_b32_e32 v0, s10, v2 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_lshrrev_b32_e32 v0, s10, v0 ; GFX7LESS-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; @@ -10188,27 +10206,27 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp ; GFX8-NEXT: s_lshl_b32 s10, s2, s8 ; GFX8-NEXT: s_mov_b64 s[2:3], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s7 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_and_b32_e32 v0, s9, v1 -; GFX8-NEXT: v_or_b32_e32 v0, s10, v0 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; GFX8-NEXT: v_and_b32_e32 v0, s9, v2 +; GFX8-NEXT: v_or_b32_e32 v1, s10, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, v2 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s8, 
v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s8, v0 ; GFX8-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -10228,27 +10246,27 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp ; GFX9-NEXT: s_lshl_b32 s10, s2, s8 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_and_b32_e32 v0, s9, v1 -; GFX9-NEXT: v_or_b32_e32 v0, s10, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; GFX9-NEXT: v_and_b32_e32 v0, s9, v2 +; GFX9-NEXT: v_or_b32_e32 v1, s10, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB17_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, s8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, s8, v0 ; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -10270,23 +10288,23 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp ; GFX1064-NEXT: s_not_b32 s9, s2 ; GFX1064-NEXT: s_mov_b32 s6, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s3 +; GFX1064-NEXT: v_mov_b32_e32 v0, s3 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: 
v_and_or_b32 v0, v1, s9, s10 -; GFX1064-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX1064-NEXT: v_and_or_b32 v1, v2, s9, s10 +; GFX1064-NEXT: v_mov_b32_e32 v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v2 +; GFX1064-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_cbranch_execnz .LBB17_1 ; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s8, v2 +; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s8, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -10310,23 +10328,23 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp ; GFX1032-NEXT: s_not_b32 s3, s3 ; GFX1032-NEXT: s_mov_b32 s6, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s7 +; GFX1032-NEXT: v_mov_b32_e32 v0, s7 ; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1032-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_and_or_b32 v0, v1, s3, s8 -; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX1032-NEXT: v_and_or_b32 v1, v2, s3, s8 +; GFX1032-NEXT: v_mov_b32_e32 v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v2 +; GFX1032-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2 ; GFX1032-NEXT: s_or_b32 s9, vcc_lo, s9 ; GFX1032-NEXT: s_andn2_b32 
exec_lo, exec_lo, s9 ; GFX1032-NEXT: s_cbranch_execnz .LBB17_1 ; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s9 -; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s2, v2 +; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -10350,25 +10368,26 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp ; GFX1164-NEXT: s_not_b32 s9, s2 ; GFX1164-NEXT: s_mov_b32 s6, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v1, s3 +; GFX1164-NEXT: v_mov_b32_e32 v0, s3 ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_and_or_b32 v0, v1, s9, s10 -; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX1164-NEXT: v_and_or_b32 v1, v2, s9, s10 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v2 +; GFX1164-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_cbranch_execnz .LBB17_1 ; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-NEXT: v_lshrrev_b32_e32 v0, s8, v2 +; GFX1164-NEXT: v_lshrrev_b32_e32 v0, s8, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_mov_b32 s2, 
-1 ; GFX1164-NEXT: buffer_store_b16 v0, off, s[0:3], 0 @@ -10392,24 +10411,26 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp ; GFX1132-NEXT: s_not_b32 s3, s3 ; GFX1132-NEXT: s_mov_b32 s6, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s7 +; GFX1132-NEXT: v_mov_b32_e32 v0, s7 ; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1132-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_and_or_b32 v0, v1, s3, s8 -; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX1132-NEXT: v_mov_b32_e32 v2, v0 +; GFX1132-NEXT: v_and_or_b32 v1, v2, s3, s8 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v2 +; GFX1132-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2 ; GFX1132-NEXT: s_or_b32 s9, vcc_lo, s9 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s9 ; GFX1132-NEXT: s_cbranch_execnz .LBB17_1 ; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s9 -; GFX1132-NEXT: v_lshrrev_b32_e32 v0, s2, v2 +; GFX1132-NEXT: v_lshrrev_b32_e32 v0, s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: buffer_store_b16 v0, off, s[0:3], 0 @@ -10433,25 +10454,26 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp ; GFX1264-NEXT: s_not_b32 s9, s2 ; GFX1264-NEXT: s_mov_b32 s6, -1 ; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: v_mov_b32_e32 v1, s3 +; GFX1264-NEXT: v_mov_b32_e32 v0, s3 ; GFX1264-NEXT: s_mov_b64 s[2:3], 0 
; GFX1264-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX1264-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1264-NEXT: v_and_or_b32 v0, v1, s9, s10 -; GFX1264-NEXT: v_mov_b32_e32 v3, v1 +; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mov_b32_e32 v2, v0 -; GFX1264-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1264-NEXT: s_wait_loadcnt 0x0 -; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX1264-NEXT: v_and_or_b32 v1, v2, s9, s10 +; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1264-NEXT: v_mov_b32_e32 v0, v1 ; GFX1264-NEXT: v_mov_b32_e32 v1, v2 +; GFX1264-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1264-NEXT: s_wait_loadcnt 0x0 +; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX1264-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1264-NEXT: s_cbranch_execnz .LBB17_1 ; GFX1264-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1264-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1264-NEXT: v_lshrrev_b32_e32 v0, s8, v2 +; GFX1264-NEXT: v_lshrrev_b32_e32 v0, s8, v0 ; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1264-NEXT: s_mov_b32 s2, -1 ; GFX1264-NEXT: buffer_store_b16 v0, off, s[0:3], null @@ -10475,24 +10497,26 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp ; GFX1232-NEXT: s_not_b32 s3, s3 ; GFX1232-NEXT: s_mov_b32 s6, -1 ; GFX1232-NEXT: s_wait_kmcnt 0x0 -; GFX1232-NEXT: v_mov_b32_e32 v1, s7 +; GFX1232-NEXT: v_mov_b32_e32 v0, s7 ; GFX1232-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1232-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX1232-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232-NEXT: v_and_or_b32 v0, v1, s3, s8 -; GFX1232-NEXT: 
v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1232-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1232-NEXT: s_wait_loadcnt 0x0 -; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX1232-NEXT: v_mov_b32_e32 v2, v0 +; GFX1232-NEXT: v_and_or_b32 v1, v2, s3, s8 +; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1232-NEXT: v_mov_b32_e32 v0, v1 ; GFX1232-NEXT: v_mov_b32_e32 v1, v2 +; GFX1232-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1232-NEXT: s_wait_loadcnt 0x0 +; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2 ; GFX1232-NEXT: s_or_b32 s9, vcc_lo, s9 ; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_and_not1_b32 exec_lo, exec_lo, s9 ; GFX1232-NEXT: s_cbranch_execnz .LBB17_1 ; GFX1232-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s9 -; GFX1232-NEXT: v_lshrrev_b32_e32 v0, s2, v2 +; GFX1232-NEXT: v_lshrrev_b32_e32 v0, s2, v0 ; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1232-NEXT: s_mov_b32 s2, -1 ; GFX1232-NEXT: buffer_store_b16 v0, off, s[0:3], null @@ -10518,35 +10542,36 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX7LESS-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX7LESS-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7LESS-NEXT: s_lshl_b32 s2, 0xffff, s10 -; GFX7LESS-NEXT: v_cvt_f32_f16_e32 v4, v0 +; GFX7LESS-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GFX7LESS-NEXT: s_not_b32 s2, s2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s3 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s3 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_lshrrev_b32_e32 v0, s10, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_lshrrev_b32_e32 v0, s10, v4 ; GFX7LESS-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7LESS-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX7LESS-NEXT: 
v_add_f32_e32 v0, v0, v2 ; GFX7LESS-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v0, s10, v0 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_and_b32_e32 v2, s2, v1 -; GFX7LESS-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; GFX7LESS-NEXT: v_and_b32_e32 v1, s2, v4 +; GFX7LESS-NEXT: v_or_b32_e32 v3, v1, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7LESS-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB18_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: v_lshrrev_b32_e32 v0, s10, v2 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_lshrrev_b32_e32 v0, s10, v0 ; GFX7LESS-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; @@ -10566,28 +10591,28 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX8-NEXT: s_lshl_b32 s2, 0xffff, s10 ; GFX8-NEXT: s_not_b32 s2, s2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s3 ; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX8-NEXT: v_add_f16_e32 v0, s11, v0 -; GFX8-NEXT: v_and_b32_e32 v2, s2, v1 +; GFX8-NEXT: v_and_b32_e32 v1, s2, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, s10, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 -; 
GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; GFX8-NEXT: v_or_b32_e32 v1, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, v2 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_cbranch_execnz .LBB18_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v0 ; GFX8-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -10607,27 +10632,27 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX9-NEXT: s_lshl_b32 s2, 0xffff, s10 ; GFX9-NEXT: s_not_b32 s2, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, s10, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX9-NEXT: v_add_f16_e32 v0, s11, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, s10, v0 -; GFX9-NEXT: v_and_or_b32 v0, v1, s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; GFX9-NEXT: v_and_or_b32 v1, v2, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX9-NEXT: 
v_mov_b32_e32 v1, v2 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX9-NEXT: s_cbranch_execnz .LBB18_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, s10, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, s10, v0 ; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -10647,26 +10672,26 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1064-NEXT: s_lshl_b32 s2, 0xffff, s9 ; GFX1064-NEXT: s_not_b32 s10, s2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s3 +; GFX1064-NEXT: v_mov_b32_e32 v0, s3 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s9, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s9, v2 ; GFX1064-NEXT: v_add_f16_e32 v0, s8, v0 ; GFX1064-NEXT: v_lshlrev_b32_sdwa v0, s9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX1064-NEXT: v_and_or_b32 v0, v1, s10, v0 -; GFX1064-NEXT: v_mov_b32_e32 v3, v1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX1064-NEXT: v_and_or_b32 v1, v2, s10, v0 +; GFX1064-NEXT: v_mov_b32_e32 v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v2 +; GFX1064-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_cbranch_execnz .LBB18_1 ; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s9, v2 +; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s9, v0 ; GFX1064-NEXT: 
s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -10688,26 +10713,26 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1032-NEXT: s_lshl_b32 s3, 0xffff, s2 ; GFX1032-NEXT: s_not_b32 s3, s3 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s6 +; GFX1032-NEXT: v_mov_b32_e32 v0, s6 ; GFX1032-NEXT: s_mov_b32 s6, -1 ; GFX1032-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s2, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s2, v2 ; GFX1032-NEXT: v_add_f16_e32 v0, s8, v0 ; GFX1032-NEXT: v_lshlrev_b32_sdwa v0, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX1032-NEXT: v_and_or_b32 v0, v1, s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX1032-NEXT: v_and_or_b32 v1, v2, s3, v0 +; GFX1032-NEXT: v_mov_b32_e32 v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v2 +; GFX1032-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2 ; GFX1032-NEXT: s_or_b32 s9, vcc_lo, s9 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 ; GFX1032-NEXT: s_cbranch_execnz .LBB18_1 ; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s9 -; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s2, v2 +; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -10730,31 +10755,32 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1164-TRUE16-NEXT: s_lshl_b32 s2, 0xffff, s9 ; GFX1164-TRUE16-NEXT: s_not_b32 s10, s2 
; GFX1164-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v1, s3 +; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v0, s3 ; GFX1164-TRUE16-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX1164-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s9, v1 -; GFX1164-TRUE16-NEXT: v_add_f16_e32 v0.l, s8, v0.l +; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s9, v2 ; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-TRUE16-NEXT: v_add_f16_e32 v0.l, s8, v0.l ; GFX1164-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s9, v0 -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-TRUE16-NEXT: v_and_or_b32 v0, v1, s10, v0 -; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc -; GFX1164-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX1164-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX1164-TRUE16-NEXT: v_and_or_b32 v1, v2, s10, v0 +; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1164-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc +; GFX1164-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX1164-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX1164-TRUE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-TRUE16-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-TRUE16-NEXT: s_cbranch_execnz .LBB18_1 ; GFX1164-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX1164-TRUE16-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s9, v2 +; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s9, v0 ; GFX1164-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1164-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 @@ -10777,31 +10803,32 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1164-FAKE16-NEXT: s_lshl_b32 s2, 0xffff, s9 ; GFX1164-FAKE16-NEXT: s_not_b32 s10, s2 ; GFX1164-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v1, s3 +; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v0, s3 ; GFX1164-FAKE16-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-FAKE16-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX1164-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s9, v1 -; GFX1164-FAKE16-NEXT: v_add_f16_e32 v0, s8, v0 +; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s9, v2 ; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-FAKE16-NEXT: v_add_f16_e32 v0, s8, v0 ; GFX1164-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s9, v0 -; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-FAKE16-NEXT: v_and_or_b32 v0, v1, s10, v0 -; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc -; GFX1164-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX1164-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX1164-FAKE16-NEXT: v_and_or_b32 v1, v2, s10, v0 +; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v0, v1 ; 
GFX1164-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1164-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc +; GFX1164-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX1164-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX1164-FAKE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-FAKE16-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-FAKE16-NEXT: s_cbranch_execnz .LBB18_1 ; GFX1164-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-FAKE16-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s9, v2 +; GFX1164-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s9, v0 ; GFX1164-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-FAKE16-NEXT: s_mov_b32 s2, -1 ; GFX1164-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 @@ -10824,30 +10851,32 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1132-TRUE16-NEXT: s_lshl_b32 s3, 0xffff, s2 ; GFX1132-TRUE16-NEXT: s_not_b32 s3, s3 ; GFX1132-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v1, s6 +; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v0, s6 ; GFX1132-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX1132-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX1132-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v1 -; GFX1132-TRUE16-NEXT: v_add_f16_e32 v0.l, s8, v0.l +; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v2, v0 +; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2 ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-TRUE16-NEXT: v_add_f16_e32 v0.l, s8, v0.l ; GFX1132-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX1132-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s2, v0 ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-TRUE16-NEXT: v_and_or_b32 v0, v1, s3, v0 -; GFX1132-TRUE16-NEXT: v_dual_mov_b32 
v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc -; GFX1132-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX1132-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX1132-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s2, v0 +; GFX1132-TRUE16-NEXT: v_and_or_b32 v1, v2, s3, v0 +; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1132-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc +; GFX1132-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX1132-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2 ; GFX1132-TRUE16-NEXT: s_or_b32 s9, vcc_lo, s9 ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s9 ; GFX1132-TRUE16-NEXT: s_cbranch_execnz .LBB18_1 ; GFX1132-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s9 -; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2 +; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v0 ; GFX1132-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1132-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 @@ -10870,30 +10899,32 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1132-FAKE16-NEXT: s_lshl_b32 s3, 0xffff, s2 ; GFX1132-FAKE16-NEXT: s_not_b32 s3, s3 ; GFX1132-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v1, s6 +; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v0, s6 ; GFX1132-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX1132-FAKE16-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX1132-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v1 -; GFX1132-FAKE16-NEXT: v_add_f16_e32 v0, s8, v0 +; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v2, v0 +; GFX1132-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2 ; GFX1132-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-FAKE16-NEXT: v_add_f16_e32 v0, s8, v0 ; GFX1132-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX1132-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s2, v0 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-FAKE16-NEXT: v_and_or_b32 v0, v1, s3, v0 -; GFX1132-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc -; GFX1132-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX1132-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX1132-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s2, v0 +; GFX1132-FAKE16-NEXT: v_and_or_b32 v1, v2, s3, v0 +; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1132-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc +; GFX1132-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX1132-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2 ; GFX1132-FAKE16-NEXT: s_or_b32 s9, vcc_lo, s9 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s9 ; GFX1132-FAKE16-NEXT: s_cbranch_execnz .LBB18_1 ; GFX1132-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s9 -; GFX1132-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2 +; GFX1132-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v0 ; GFX1132-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-FAKE16-NEXT: s_mov_b32 s2, -1 ; GFX1132-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 @@ -10916,31 +10947,32 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1264-TRUE16-NEXT: s_lshl_b32 s2, 0xffff, s9 ; GFX1264-TRUE16-NEXT: s_not_b32 s10, s2 ; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v1, s3 +; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v0, s3 ; GFX1264-TRUE16-NEXT: s_mov_b64 s[2:3], 0 ; 
GFX1264-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX1264-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s9, v1 -; GFX1264-TRUE16-NEXT: v_add_f16_e32 v0.l, s8, v0.l +; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v2, v0 +; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s9, v2 ; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264-TRUE16-NEXT: v_add_f16_e32 v0.l, s8, v0.l ; GFX1264-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s9, v0 -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1264-TRUE16-NEXT: v_and_or_b32 v0, v1, s10, v0 -; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v3, v1 -; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v2, v0 -; GFX1264-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1264-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1264-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX1264-TRUE16-NEXT: v_and_or_b32 v1, v2, s10, v0 +; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v0, v1 ; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1264-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1264-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1264-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX1264-TRUE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1264-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX1264-TRUE16-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1264-TRUE16-NEXT: s_cbranch_execnz .LBB18_1 ; GFX1264-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1264-TRUE16-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s9, v2 +; GFX1264-TRUE16-NEXT: 
v_lshrrev_b32_e32 v0, s9, v0 ; GFX1264-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1264-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1264-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null @@ -10963,31 +10995,32 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1264-FAKE16-NEXT: s_lshl_b32 s2, 0xffff, s9 ; GFX1264-FAKE16-NEXT: s_not_b32 s10, s2 ; GFX1264-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v1, s3 +; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v0, s3 ; GFX1264-FAKE16-NEXT: s_mov_b64 s[2:3], 0 ; GFX1264-FAKE16-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX1264-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s9, v1 -; GFX1264-FAKE16-NEXT: v_add_f16_e32 v0, s8, v0 +; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v2, v0 +; GFX1264-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s9, v2 ; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264-FAKE16-NEXT: v_add_f16_e32 v0, s8, v0 ; GFX1264-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s9, v0 -; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1264-FAKE16-NEXT: v_and_or_b32 v0, v1, s10, v0 -; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v3, v1 -; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v2, v0 -; GFX1264-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1264-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX1264-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX1264-FAKE16-NEXT: v_and_or_b32 v1, v2, s10, v0 +; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v0, v1 ; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1264-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], 
off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1264-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1264-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX1264-FAKE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1264-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX1264-FAKE16-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1264-FAKE16-NEXT: s_cbranch_execnz .LBB18_1 ; GFX1264-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1264-FAKE16-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1264-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s9, v2 +; GFX1264-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s9, v0 ; GFX1264-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1264-FAKE16-NEXT: s_mov_b32 s2, -1 ; GFX1264-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null @@ -11010,30 +11043,32 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1232-TRUE16-NEXT: s_lshl_b32 s3, 0xffff, s2 ; GFX1232-TRUE16-NEXT: s_not_b32 s3, s3 ; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v1, s6 +; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v0, s6 ; GFX1232-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX1232-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX1232-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v1 -; GFX1232-TRUE16-NEXT: v_add_f16_e32 v0.l, s8, v0.l +; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v2, v0 +; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2 ; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232-TRUE16-NEXT: v_add_f16_e32 v0.l, s8, v0.l ; GFX1232-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX1232-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s2, v0 ; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232-TRUE16-NEXT: v_and_or_b32 v0, v1, s3, v0 -; GFX1232-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1232-TRUE16-NEXT: buffer_atomic_cmpswap_b32 
v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1232-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1232-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX1232-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s2, v0 +; GFX1232-TRUE16-NEXT: v_and_or_b32 v1, v2, s3, v0 +; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v0, v1 ; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1232-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1232-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1232-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2 ; GFX1232-TRUE16-NEXT: s_or_b32 s9, vcc_lo, s9 ; GFX1232-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX1232-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s9 ; GFX1232-TRUE16-NEXT: s_cbranch_execnz .LBB18_1 ; GFX1232-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1232-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s9 -; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2 +; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v0 ; GFX1232-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1232-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1232-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null @@ -11056,30 +11091,32 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1232-FAKE16-NEXT: s_lshl_b32 s3, 0xffff, s2 ; GFX1232-FAKE16-NEXT: s_not_b32 s3, s3 ; GFX1232-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v1, s6 +; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v0, s6 ; GFX1232-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX1232-FAKE16-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX1232-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v1 -; GFX1232-FAKE16-NEXT: v_add_f16_e32 v0, s8, v0 +; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v2, v0 +; GFX1232-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2 ; GFX1232-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232-FAKE16-NEXT: v_add_f16_e32 v0, s8, v0 ; GFX1232-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX1232-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s2, v0 ; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232-FAKE16-NEXT: v_and_or_b32 v0, v1, s3, v0 -; GFX1232-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1232-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1232-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX1232-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX1232-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s2, v0 +; GFX1232-FAKE16-NEXT: v_and_or_b32 v1, v2, s3, v0 +; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v0, v1 ; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1232-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1232-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1232-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2 ; GFX1232-FAKE16-NEXT: s_or_b32 s9, vcc_lo, s9 ; GFX1232-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX1232-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s9 ; GFX1232-FAKE16-NEXT: s_cbranch_execnz .LBB18_1 ; GFX1232-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1232-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s9 -; GFX1232-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2 +; GFX1232-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v0 ; GFX1232-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1232-FAKE16-NEXT: s_mov_b32 s2, -1 ; GFX1232-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null @@ -11105,35 +11142,36 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX7LESS-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX7LESS-NEXT: v_mul_f32_e64 v0, 1.0, s6 ; GFX7LESS-NEXT: s_lshl_b32 s2, 0xffff, s10 -; GFX7LESS-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX7LESS-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 ; 
GFX7LESS-NEXT: s_not_b32 s2, s2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s3 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s3 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_lshrrev_b32_e32 v0, s10, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_lshrrev_b32_e32 v0, s10, v4 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7LESS-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX7LESS-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX7LESS-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v0, s10, v0 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_and_b32_e32 v2, s2, v1 -; GFX7LESS-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; GFX7LESS-NEXT: v_and_b32_e32 v1, s2, v4 +; GFX7LESS-NEXT: v_or_b32_e32 v3, v1, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7LESS-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB19_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: v_lshrrev_b32_e32 v0, s10, v2 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_lshrrev_b32_e32 v0, s10, v0 ; GFX7LESS-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; @@ -11152,37 +11190,37 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX8-NEXT: s_lshl_b32 s2, s6, 
16 ; GFX8-NEXT: s_not_b32 s3, s3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s7 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s10 -; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v3, s2, v3 -; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX8-NEXT: v_and_b32_e32 v2, s3, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, s10 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; GFX8-NEXT: v_lshrrev_b32_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_add_f32_e32 v0, s2, v0 +; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX8-NEXT: v_and_b32_e32 v3, s3, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v1, v3, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, v2 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; 
GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_cbranch_execnz .LBB19_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v0 ; GFX8-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -11202,34 +11240,34 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX9-NEXT: s_lshl_b32 s2, s6, 16 ; GFX9-NEXT: s_not_b32 s3, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_lshrrev_b32_sdwa v0, s10, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_add_f32_e32 v0, s2, v0 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s11 +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s11 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_and_or_b32 v0, v1, s3, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; GFX9-NEXT: v_and_or_b32 v1, v2, s3, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: 
v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX9-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX9-NEXT: s_cbranch_execnz .LBB19_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, s10, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, s10, v0 ; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -11250,31 +11288,31 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1064-NEXT: s_not_b32 s9, s2 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s7 +; GFX1064-NEXT: v_mov_b32_e32 v0, s7 ; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1064-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_lshrrev_b32_sdwa v0, s8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: v_lshrrev_b32_sdwa v0, s8, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX1064-NEXT: v_add_f32_e32 v0, s10, v0 -; GFX1064-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX1064-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX1064-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX1064-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX1064-NEXT: v_add3_u32 v2, v2, v0, 0x7fff -; GFX1064-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX1064-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX1064-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc ; GFX1064-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX1064-NEXT: v_and_or_b32 v0, v1, s9, v0 -; GFX1064-NEXT: v_mov_b32_e32 v3, v1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: 
v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX1064-NEXT: v_and_or_b32 v1, v2, s9, v0 +; GFX1064-NEXT: v_mov_b32_e32 v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v2 +; GFX1064-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_cbranch_execnz .LBB19_1 ; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s8, v2 +; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s8, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -11297,31 +11335,31 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1032-NEXT: s_not_b32 s8, s3 ; GFX1032-NEXT: s_mov_b32 s3, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s7 +; GFX1032-NEXT: v_mov_b32_e32 v0, s7 ; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1032-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_lshrrev_b32_sdwa v0, s2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: v_lshrrev_b32_sdwa v0, s2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX1032-NEXT: v_add_f32_e32 v0, s9, v0 -; GFX1032-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX1032-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX1032-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX1032-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX1032-NEXT: v_add3_u32 v2, v2, v0, 0x7fff -; GFX1032-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX1032-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX1032-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc_lo ; GFX1032-NEXT: v_lshlrev_b32_sdwa v0, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX1032-NEXT: v_and_or_b32 v0, v1, s8, 
v0 -; GFX1032-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX1032-NEXT: v_and_or_b32 v1, v2, s8, v0 +; GFX1032-NEXT: v_mov_b32_e32 v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v2 +; GFX1032-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2 ; GFX1032-NEXT: s_or_b32 s3, vcc_lo, s3 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: s_cbranch_execnz .LBB19_1 ; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s2, v2 +; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -11344,41 +11382,42 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1164-TRUE16-NEXT: s_not_b32 s9, s2 ; GFX1164-TRUE16-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v1, s7 +; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v0, s7 ; GFX1164-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164-TRUE16-NEXT: .p2align 6 ; GFX1164-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX1164-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s8, v1 -; GFX1164-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s8, v2 ; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX1164-TRUE16-NEXT: v_add_f32_e32 v0, s10, v0 -; GFX1164-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 
+; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1164-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX1164-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX1164-TRUE16-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff -; GFX1164-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX1164-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h -; GFX1164-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s8, v2 +; GFX1164-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-TRUE16-NEXT: v_and_or_b32 v0, v1, s9, v0 -; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc -; GFX1164-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX1164-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX1164-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; GFX1164-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 +; GFX1164-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h +; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s8, v1 +; GFX1164-TRUE16-NEXT: v_and_or_b32 v1, v2, s9, v0 +; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1164-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc +; GFX1164-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX1164-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX1164-TRUE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-TRUE16-NEXT: s_and_not1_b64 exec, exec, 
s[2:3] ; GFX1164-TRUE16-NEXT: s_cbranch_execnz .LBB19_1 ; GFX1164-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-TRUE16-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s8, v2 +; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s8, v0 ; GFX1164-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1164-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 @@ -11401,40 +11440,41 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1164-FAKE16-NEXT: s_not_b32 s9, s2 ; GFX1164-FAKE16-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v1, s7 +; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v0, s7 ; GFX1164-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164-FAKE16-NEXT: .p2align 6 ; GFX1164-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX1164-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s8, v1 -; GFX1164-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s8, v2 ; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX1164-FAKE16-NEXT: v_add_f32_e32 v0, s10, v0 -; GFX1164-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1164-FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX1164-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX1164-FAKE16-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff -; GFX1164-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX1164-FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; 
GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc ; GFX1164-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s8, v0 -; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-FAKE16-NEXT: v_and_or_b32 v0, v1, s9, v0 -; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc -; GFX1164-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX1164-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX1164-FAKE16-NEXT: v_and_or_b32 v1, v2, s9, v0 +; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1164-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc +; GFX1164-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX1164-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX1164-FAKE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-FAKE16-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-FAKE16-NEXT: s_cbranch_execnz .LBB19_1 ; GFX1164-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-FAKE16-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s8, v2 +; GFX1164-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s8, v0 ; GFX1164-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-FAKE16-NEXT: s_mov_b32 s2, -1 ; GFX1164-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 @@ -11457,40 +11497,42 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1132-TRUE16-NEXT: s_not_b32 s8, s3 ; GFX1132-TRUE16-NEXT: s_mov_b32 s3, 0 ; GFX1132-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v1, s7 +; GFX1132-TRUE16-NEXT: 
v_mov_b32_e32 v0, s7 ; GFX1132-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1132-TRUE16-NEXT: .p2align 6 ; GFX1132-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX1132-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v1 -; GFX1132-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v2, v0 +; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2 ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX1132-TRUE16-NEXT: v_add_f32_e32 v0, s9, v0 -; GFX1132-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1132-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX1132-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX1132-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff -; GFX1132-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX1132-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h -; GFX1132-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s2, v2 +; GFX1132-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc_lo +; GFX1132-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 +; GFX1132-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-TRUE16-NEXT: v_and_or_b32 v0, v1, s8, v0 -; GFX1132-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; 
GFX1132-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc -; GFX1132-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX1132-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX1132-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s2, v1 +; GFX1132-TRUE16-NEXT: v_and_or_b32 v1, v2, s8, v0 +; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1132-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc +; GFX1132-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX1132-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2 ; GFX1132-TRUE16-NEXT: s_or_b32 s3, vcc_lo, s3 ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s3 ; GFX1132-TRUE16-NEXT: s_cbranch_execnz .LBB19_1 ; GFX1132-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2 +; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v0 ; GFX1132-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1132-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 @@ -11513,39 +11555,41 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1132-FAKE16-NEXT: s_not_b32 s8, s3 ; GFX1132-FAKE16-NEXT: s_mov_b32 s3, 0 ; GFX1132-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v1, s7 +; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v0, s7 ; GFX1132-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1132-FAKE16-NEXT: .p2align 6 ; GFX1132-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX1132-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v1 -; GFX1132-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v2, v0 +; GFX1132-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2 ; GFX1132-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX1132-FAKE16-NEXT: v_add_f32_e32 v0, s9, v0 -; GFX1132-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1132-FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX1132-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX1132-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff -; GFX1132-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX1132-FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc_lo ; GFX1132-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX1132-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s2, v0 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-FAKE16-NEXT: v_and_or_b32 v0, v1, s8, v0 -; GFX1132-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc -; GFX1132-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX1132-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX1132-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s2, v0 +; GFX1132-FAKE16-NEXT: v_and_or_b32 v1, v2, s8, v0 +; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1132-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc +; GFX1132-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX1132-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2 ; GFX1132-FAKE16-NEXT: s_or_b32 s3, vcc_lo, s3 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s3 ; 
GFX1132-FAKE16-NEXT: s_cbranch_execnz .LBB19_1 ; GFX1132-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1132-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2 +; GFX1132-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v0 ; GFX1132-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-FAKE16-NEXT: s_mov_b32 s2, -1 ; GFX1132-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 @@ -11568,41 +11612,42 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1264-TRUE16-NEXT: s_not_b32 s9, s2 ; GFX1264-TRUE16-NEXT: s_mov_b64 s[2:3], 0 ; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v1, s7 +; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v0, s7 ; GFX1264-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1264-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX1264-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s8, v1 -; GFX1264-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v2, v0 +; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s8, v2 ; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX1264-TRUE16-NEXT: v_add_f32_e32 v0, s10, v0 -; GFX1264-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1264-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX1264-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX1264-TRUE16-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1264-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX1264-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX1264-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX1264-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX1264-TRUE16-NEXT: 
v_mov_b16_e32 v2.h, 0 -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h -; GFX1264-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s8, v2 ; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1264-TRUE16-NEXT: v_and_or_b32 v0, v1, s9, v0 -; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v3, v1 -; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v2, v0 -; GFX1264-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1264-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1264-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX1264-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; GFX1264-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 +; GFX1264-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h +; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s8, v1 +; GFX1264-TRUE16-NEXT: v_and_or_b32 v1, v2, s9, v0 +; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v0, v1 ; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1264-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1264-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1264-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX1264-TRUE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1264-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1264-TRUE16-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1264-TRUE16-NEXT: s_cbranch_execnz .LBB19_1 ; GFX1264-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1264-TRUE16-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s8, v2 +; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s8, v0 ; GFX1264-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1264-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1264-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null @@ -11625,40 +11670,41 @@ define amdgpu_kernel void 
@uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1264-FAKE16-NEXT: s_not_b32 s9, s2 ; GFX1264-FAKE16-NEXT: s_mov_b64 s[2:3], 0 ; GFX1264-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v1, s7 +; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v0, s7 ; GFX1264-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1264-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX1264-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s8, v1 -; GFX1264-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v2, v0 +; GFX1264-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s8, v2 ; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX1264-FAKE16-NEXT: v_add_f32_e32 v0, s10, v0 -; GFX1264-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1264-FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX1264-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX1264-FAKE16-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1264-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX1264-FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX1264-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX1264-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc ; GFX1264-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s8, v0 -; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1264-FAKE16-NEXT: 
v_and_or_b32 v0, v1, s9, v0 -; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v3, v1 -; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v2, v0 -; GFX1264-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1264-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX1264-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX1264-FAKE16-NEXT: v_and_or_b32 v1, v2, s9, v0 +; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v0, v1 ; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1264-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1264-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1264-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX1264-FAKE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1264-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1264-FAKE16-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1264-FAKE16-NEXT: s_cbranch_execnz .LBB19_1 ; GFX1264-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1264-FAKE16-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1264-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s8, v2 +; GFX1264-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s8, v0 ; GFX1264-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1264-FAKE16-NEXT: s_mov_b32 s2, -1 ; GFX1264-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null @@ -11681,40 +11727,42 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1232-TRUE16-NEXT: s_not_b32 s8, s3 ; GFX1232-TRUE16-NEXT: s_mov_b32 s3, 0 ; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v1, s7 +; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v0, s7 ; GFX1232-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1232-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX1232-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v1 -; GFX1232-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; 
GFX1232-TRUE16-NEXT: v_mov_b32_e32 v2, v0 +; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2 ; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX1232-TRUE16-NEXT: v_add_f32_e32 v0, s9, v0 -; GFX1232-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1232-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX1232-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX1232-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1232-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX1232-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX1232-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX1232-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX1232-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h -; GFX1232-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s2, v2 +; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1232-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc_lo +; GFX1232-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 +; GFX1232-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h ; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232-TRUE16-NEXT: v_and_or_b32 v0, v1, s8, v0 -; GFX1232-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1232-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1232-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1232-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX1232-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s2, v1 +; GFX1232-TRUE16-NEXT: v_and_or_b32 v1, v2, s8, v0 +; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX1232-TRUE16-NEXT: v_mov_b32_e32 v0, v1 ; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1232-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1232-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1232-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2 ; GFX1232-TRUE16-NEXT: s_or_b32 s3, vcc_lo, s3 ; GFX1232-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX1232-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s3 ; GFX1232-TRUE16-NEXT: s_cbranch_execnz .LBB19_1 ; GFX1232-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1232-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2 +; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v0 ; GFX1232-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1232-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1232-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null @@ -11737,39 +11785,41 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1232-FAKE16-NEXT: s_not_b32 s8, s3 ; GFX1232-FAKE16-NEXT: s_mov_b32 s3, 0 ; GFX1232-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v1, s7 +; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v0, s7 ; GFX1232-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1232-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX1232-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v1 -; GFX1232-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v2, v0 +; GFX1232-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2 ; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX1232-FAKE16-NEXT: v_add_f32_e32 v0, s9, v0 -; GFX1232-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1232-FAKE16-NEXT: v_bfe_u32 
v1, v0, 16, 1 ; GFX1232-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX1232-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1232-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX1232-FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX1232-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX1232-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo ; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc_lo ; GFX1232-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX1232-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s2, v0 ; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232-FAKE16-NEXT: v_and_or_b32 v0, v1, s8, v0 -; GFX1232-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1232-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1232-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX1232-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX1232-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s2, v0 +; GFX1232-FAKE16-NEXT: v_and_or_b32 v1, v2, s8, v0 +; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v0, v1 ; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1232-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1232-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1232-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2 ; GFX1232-FAKE16-NEXT: s_or_b32 s3, vcc_lo, s3 ; GFX1232-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX1232-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s3 ; GFX1232-FAKE16-NEXT: s_cbranch_execnz .LBB19_1 ; GFX1232-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1232-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1232-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2 +; GFX1232-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v0 ; 
GFX1232-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1232-FAKE16-NEXT: s_mov_b32 s2, -1 ; GFX1232-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null @@ -11851,28 +11901,28 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr ; GFX8-NEXT: s_lshr_b32 s11, s10, 16 ; GFX8-NEXT: s_mov_b32 s4, s2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v0, s5 ; GFX8-NEXT: s_mov_b32 s5, s3 ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s11 -; GFX8-NEXT: v_add_f16_e32 v2, s10, v1 -; GFX8-NEXT: v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, s11 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; GFX8-NEXT: v_add_f16_sdwa v0, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_add_f16_e32 v1, s10, v2 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, v2 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_cbranch_execnz .LBB20_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: uniform_fadd_v2f16: @@ -11886,25 +11936,25 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr ; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0 ; 
GFX9-NEXT: s_mov_b32 s4, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NEXT: s_mov_b32 s5, s3 ; GFX9-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_pk_add_f16 v0, v1, s10 -; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; GFX9-NEXT: v_pk_add_f16 v1, v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX9-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX9-NEXT: s_cbranch_execnz .LBB20_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: uniform_fadd_v2f16: @@ -11919,17 +11969,17 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr ; GFX1064-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX1064-NEXT: s_mov_b32 s5, s3 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064-NEXT: v_mov_b32_e32 v0, s4 ; GFX1064-NEXT: s_mov_b32 s4, s2 ; GFX1064-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_pk_add_f16 v0, v1, s10 -; GFX1064-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX1064-NEXT: v_pk_add_f16 v1, v2, s10 +; GFX1064-NEXT: v_mov_b32_e32 v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v2 +; 
GFX1064-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX1064-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX1064-NEXT: s_cbranch_execnz .LBB20_1 @@ -11937,7 +11987,7 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr ; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: uniform_fadd_v2f16: @@ -11952,17 +12002,17 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr ; GFX1032-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX1032-NEXT: s_mov_b32 s5, s3 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s4 +; GFX1032-NEXT: v_mov_b32_e32 v0, s4 ; GFX1032-NEXT: s_mov_b32 s4, s2 ; GFX1032-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_pk_add_f16 v0, v1, s8 -; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX1032-NEXT: v_pk_add_f16 v1, v2, s8 +; GFX1032-NEXT: v_mov_b32_e32 v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v2 +; GFX1032-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2 ; GFX1032-NEXT: s_or_b32 s9, vcc_lo, s9 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 ; GFX1032-NEXT: s_cbranch_execnz .LBB20_1 @@ -11970,7 +12020,7 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_mov_b32 s2, -1 -; 
GFX1032-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: uniform_fadd_v2f16: @@ -11985,18 +12035,19 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr ; GFX1164-NEXT: s_load_b32 s4, s[2:3], 0x0 ; GFX1164-NEXT: s_mov_b32 s5, s3 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164-NEXT: v_mov_b32_e32 v0, s4 ; GFX1164-NEXT: s_mov_b32 s4, s2 ; GFX1164-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_pk_add_f16 v0, v1, s10 -; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX1164-NEXT: v_pk_add_f16 v1, v2, s10 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v2 +; GFX1164-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX1164-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[8:9] @@ -12005,7 +12056,7 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr ; GFX1164-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: buffer_store_b32 v2, off, s[0:3], 0 +; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: uniform_fadd_v2f16: @@ -12020,17 +12071,19 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, 
ptr addr ; GFX1132-NEXT: s_load_b32 s4, s[2:3], 0x0 ; GFX1132-NEXT: s_mov_b32 s5, s3 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s4 +; GFX1132-NEXT: v_mov_b32_e32 v0, s4 ; GFX1132-NEXT: s_mov_b32 s4, s2 ; GFX1132-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_pk_add_f16 v0, v1, s8 -; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX1132-NEXT: v_mov_b32_e32 v2, v0 +; GFX1132-NEXT: v_pk_add_f16 v1, v2, s8 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v2 +; GFX1132-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2 ; GFX1132-NEXT: s_or_b32 s9, vcc_lo, s9 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s9 @@ -12039,7 +12092,7 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: buffer_store_b32 v2, off, s[0:3], 0 +; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_endpgm ; ; GFX1264-LABEL: uniform_fadd_v2f16: @@ -12054,18 +12107,19 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr ; GFX1264-NEXT: s_load_b32 s4, s[2:3], 0x0 ; GFX1264-NEXT: s_mov_b32 s5, s3 ; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: v_mov_b32_e32 v1, s4 +; GFX1264-NEXT: v_mov_b32_e32 v0, s4 ; GFX1264-NEXT: s_mov_b32 s4, s2 ; GFX1264-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX1264-NEXT: ; =>This Inner Loop Header: Depth=1 -; 
GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1264-NEXT: v_pk_add_f16 v0, v1, s10 -; GFX1264-NEXT: v_mov_b32_e32 v3, v1 +; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mov_b32_e32 v2, v0 -; GFX1264-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1264-NEXT: s_wait_loadcnt 0x0 -; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX1264-NEXT: v_pk_add_f16 v1, v2, s10 +; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1264-NEXT: v_mov_b32_e32 v0, v1 ; GFX1264-NEXT: v_mov_b32_e32 v1, v2 +; GFX1264-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1264-NEXT: s_wait_loadcnt 0x0 +; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX1264-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX1264-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1264-NEXT: s_and_not1_b64 exec, exec, s[8:9] @@ -12074,7 +12128,7 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr ; GFX1264-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1264-NEXT: s_mov_b32 s2, -1 -; GFX1264-NEXT: buffer_store_b32 v2, off, s[0:3], null +; GFX1264-NEXT: buffer_store_b32 v0, off, s[0:3], null ; GFX1264-NEXT: s_endpgm ; ; GFX1232-LABEL: uniform_fadd_v2f16: @@ -12089,17 +12143,19 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr ; GFX1232-NEXT: s_load_b32 s4, s[2:3], 0x0 ; GFX1232-NEXT: s_mov_b32 s5, s3 ; GFX1232-NEXT: s_wait_kmcnt 0x0 -; GFX1232-NEXT: v_mov_b32_e32 v1, s4 +; GFX1232-NEXT: v_mov_b32_e32 v0, s4 ; GFX1232-NEXT: s_mov_b32 s4, s2 ; GFX1232-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX1232-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232-NEXT: v_pk_add_f16 v0, v1, s8 -; GFX1232-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; 
GFX1232-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1232-NEXT: s_wait_loadcnt 0x0 -; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX1232-NEXT: v_mov_b32_e32 v2, v0 +; GFX1232-NEXT: v_pk_add_f16 v1, v2, s8 +; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1232-NEXT: v_mov_b32_e32 v0, v1 ; GFX1232-NEXT: v_mov_b32_e32 v1, v2 +; GFX1232-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1232-NEXT: s_wait_loadcnt 0x0 +; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2 ; GFX1232-NEXT: s_or_b32 s9, vcc_lo, s9 ; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_and_not1_b32 exec_lo, exec_lo, s9 @@ -12108,7 +12164,7 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1232-NEXT: s_mov_b32 s2, -1 -; GFX1232-NEXT: buffer_store_b32 v2, off, s[0:3], null +; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null ; GFX1232-NEXT: s_endpgm %rmw = atomicrmw fadd ptr addrspace(1) %uniform.ptr, <2 x half> %val monotonic, align 4 store <2 x half> %rmw, ptr addrspace(1) %result @@ -12186,41 +12242,41 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX8-NEXT: s_mov_b32 s4, s10 ; GFX8-NEXT: s_mov_b32 s5, s11 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 ; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; GFX8-NEXT: v_add_f32_e32 v0, s12, v0 -; GFX8-NEXT: v_add_f32_e32 v2, s13, v2 +; GFX8-NEXT: v_add_f32_e32 v1, s13, v1 ; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX8-NEXT: 
v_bfe_u32 v5, v1, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v2 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v4, s[0:1] -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_alignbit_b32 v0, v2, v0, 16 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, v2 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_cbranch_execnz .LBB21_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_mov_b32 s11, 0xf000 ; GFX8-NEXT: s_mov_b32 s10, -1 -; GFX8-NEXT: buffer_store_dword v2, off, s[8:11], 0 +; GFX8-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: uniform_fadd_v2bf16: @@ -12237,40 +12293,40 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX9-NEXT: s_lshl_b32 s14, s0, 16 ; GFX9-NEXT: s_and_b32 s15, s0, 0xffff0000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: s_mov_b32 s4, 
s10 ; GFX9-NEXT: s_mov_b32 s5, s11 ; GFX9-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; GFX9-NEXT: v_add_f32_e32 v0, s14, v0 -; GFX9-NEXT: v_add_f32_e32 v2, s15, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s15, v1 ; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX9-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v1 ; GFX9-NEXT: v_add3_u32 v3, v3, v0, s12 -; GFX9-NEXT: v_add3_u32 v5, v5, v2, s12 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_add3_u32 v5, v5, v1, s12 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v4, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc -; GFX9-NEXT: v_perm_b32 v0, v2, v0, s13 -; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; GFX9-NEXT: v_perm_b32 v1, v1, v0, s13 +; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB21_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_mov_b32 s11, 0xf000 ; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: buffer_store_dword v2, off, s[8:11], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; GFX9-NEXT: s_endpgm 
; ; GFX1064-LABEL: uniform_fadd_v2bf16: @@ -12288,30 +12344,30 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1064-NEXT: s_mov_b32 s4, s10 ; GFX1064-NEXT: s_mov_b32 s5, s11 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s1 +; GFX1064-NEXT: v_mov_b32_e32 v0, s1 ; GFX1064-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX1064-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX1064-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; GFX1064-NEXT: v_add_f32_e32 v0, s12, v0 -; GFX1064-NEXT: v_add_f32_e32 v2, s13, v2 +; GFX1064-NEXT: v_add_f32_e32 v1, s13, v1 ; GFX1064-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX1064-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX1064-NEXT: v_bfe_u32 v4, v1, 16, 1 ; GFX1064-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX1064-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX1064-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX1064-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX1064-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX1064-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX1064-NEXT: v_add3_u32 v4, v4, v2, 0x7fff +; GFX1064-NEXT: v_add3_u32 v4, v4, v1, 0x7fff ; GFX1064-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0 -; GFX1064-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc +; GFX1064-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc ; GFX1064-NEXT: v_cndmask_b32_e64 v0, v3, v5, s[0:1] -; GFX1064-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 -; GFX1064-NEXT: v_mov_b32_e32 v3, v1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX1064-NEXT: v_perm_b32 v1, v1, v0, 0x7060302 +; GFX1064-NEXT: v_mov_b32_e32 v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v2 +; GFX1064-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: 
v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_cbranch_execnz .LBB21_1 @@ -12319,7 +12375,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s10, -1 -; GFX1064-NEXT: buffer_store_dword v2, off, s[8:11], 0 +; GFX1064-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: uniform_fadd_v2bf16: @@ -12336,31 +12392,31 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1032-NEXT: s_and_b32 s3, s0, 0xffff0000 ; GFX1032-NEXT: s_mov_b32 s5, s11 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s4 +; GFX1032-NEXT: v_mov_b32_e32 v0, s4 ; GFX1032-NEXT: s_mov_b32 s4, s10 ; GFX1032-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX1032-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX1032-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; GFX1032-NEXT: v_add_f32_e32 v0, s2, v0 -; GFX1032-NEXT: v_add_f32_e32 v2, s3, v2 +; GFX1032-NEXT: v_add_f32_e32 v1, s3, v1 ; GFX1032-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX1032-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX1032-NEXT: v_bfe_u32 v4, v1, 16, 1 ; GFX1032-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX1032-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX1032-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX1032-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX1032-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX1032-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX1032-NEXT: v_add3_u32 v4, v4, v2, 0x7fff +; GFX1032-NEXT: v_add3_u32 v4, v4, v1, 0x7fff ; GFX1032-NEXT: v_cmp_u_f32_e64 s0, v0, v0 -; GFX1032-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo +; GFX1032-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo 
; GFX1032-NEXT: v_cndmask_b32_e64 v0, v3, v5, s0 -; GFX1032-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 -; GFX1032-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX1032-NEXT: v_perm_b32 v1, v1, v0, 0x7060302 +; GFX1032-NEXT: v_mov_b32_e32 v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v2 +; GFX1032-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2 ; GFX1032-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s1 ; GFX1032-NEXT: s_cbranch_execnz .LBB21_1 @@ -12368,7 +12424,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1032-NEXT: s_mov_b32 s10, -1 -; GFX1032-NEXT: buffer_store_dword v2, off, s[8:11], 0 +; GFX1032-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-TRUE16-LABEL: uniform_fadd_v2bf16: @@ -12385,40 +12441,40 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1164-TRUE16-NEXT: s_lshl_b32 s11, s4, 16 ; GFX1164-TRUE16-NEXT: s_mov_b32 s4, s2 ; GFX1164-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v1, s5 +; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v0, s5 ; GFX1164-TRUE16-NEXT: s_mov_b32 s5, s3 ; GFX1164-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-TRUE16-NEXT: .p2align 6 ; GFX1164-TRUE16-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX1164-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX1164-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX1164-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-TRUE16-NEXT: v_add_f32_e32 v0, s11, v0 +; GFX1164-TRUE16-NEXT: v_add_f32_e32 v1, s10, v1 ; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-TRUE16-NEXT: v_add_f32_e32 v2, s10, v2 ; GFX1164-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX1164-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 ; GFX1164-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX1164-TRUE16-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX1164-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX1164-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 ; GFX1164-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX1164-TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff +; GFX1164-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff ; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc -; GFX1164-TRUE16-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX1164-TRUE16-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX1164-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc -; GFX1164-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v2 -; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc -; GFX1164-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX1164-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX1164-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc +; GFX1164-TRUE16-NEXT: 
v_bfi_b32 v1, 0xffff, v0, v1 +; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1164-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc +; GFX1164-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX1164-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX1164-TRUE16-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX1164-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-TRUE16-NEXT: s_and_not1_b64 exec, exec, s[8:9] @@ -12428,7 +12484,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1164-TRUE16-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1164-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-TRUE16-NEXT: s_mov_b32 s2, -1 -; GFX1164-TRUE16-NEXT: buffer_store_b32 v2, off, s[0:3], 0 +; GFX1164-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-TRUE16-NEXT: s_endpgm ; ; GFX1164-FAKE16-LABEL: uniform_fadd_v2bf16: @@ -12446,37 +12502,37 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1164-FAKE16-NEXT: s_mov_b32 s4, s10 ; GFX1164-FAKE16-NEXT: s_mov_b32 s5, s11 ; GFX1164-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v1, s1 +; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v0, s1 ; GFX1164-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-FAKE16-NEXT: .p2align 6 ; GFX1164-FAKE16-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX1164-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX1164-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX1164-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-FAKE16-NEXT: v_add_f32_e32 v0, s12, v0 +; GFX1164-FAKE16-NEXT: v_add_f32_e32 v1, s13, v1 ; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-FAKE16-NEXT: v_add_f32_e32 v2, s13, v2 ; GFX1164-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX1164-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 ; GFX1164-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX1164-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX1164-FAKE16-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX1164-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX1164-FAKE16-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX1164-FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX1164-FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff +; GFX1164-FAKE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff ; GFX1164-FAKE16-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0 ; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc +; GFX1164-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc ; GFX1164-FAKE16-NEXT: v_cndmask_b32_e64 v0, v3, v5, s[0:1] -; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 -; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc -; GFX1164-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX1164-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-FAKE16-NEXT: v_perm_b32 v1, v1, v0, 0x7060302 +; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v0, v1 ; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1164-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc +; GFX1164-FAKE16-NEXT: s_waitcnt 
vmcnt(0) +; GFX1164-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX1164-FAKE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-FAKE16-NEXT: s_and_not1_b64 exec, exec, s[2:3] @@ -12486,7 +12542,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1164-FAKE16-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1164-FAKE16-NEXT: s_mov_b32 s10, -1 -; GFX1164-FAKE16-NEXT: buffer_store_b32 v2, off, s[8:11], 0 +; GFX1164-FAKE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0 ; GFX1164-FAKE16-NEXT: s_endpgm ; ; GFX1132-TRUE16-LABEL: uniform_fadd_v2bf16: @@ -12503,39 +12559,39 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1132-TRUE16-NEXT: s_lshl_b32 s10, s4, 16 ; GFX1132-TRUE16-NEXT: s_mov_b32 s4, s2 ; GFX1132-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v1, s5 +; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v0, s5 ; GFX1132-TRUE16-NEXT: s_mov_b32 s5, s3 ; GFX1132-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-TRUE16-NEXT: .p2align 6 ; GFX1132-TRUE16-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX1132-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX1132-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v2, v0 +; GFX1132-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-TRUE16-NEXT: v_dual_add_f32 v1, s9, v1 :: v_dual_lshlrev_b32 v0, 16, v2 +; GFX1132-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-TRUE16-NEXT: 
v_add_f32_e32 v0, s10, v0 -; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-TRUE16-NEXT: v_add_f32_e32 v2, s9, v2 +; GFX1132-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX1132-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff +; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1132-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1 ; GFX1132-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX1132-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX1132-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; GFX1132-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX1132-TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff -; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1132-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo -; GFX1132-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX1132-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX1132-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo ; GFX1132-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo -; GFX1132-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v2 -; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc -; GFX1132-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX1132-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v0, v1 +; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-TRUE16-NEXT: 
v_mov_b32_e32 v1, v2 +; GFX1132-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc +; GFX1132-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX1132-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2 ; GFX1132-TRUE16-NEXT: s_or_b32 s8, vcc_lo, s8 ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s8 @@ -12545,7 +12601,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1132-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX1132-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-TRUE16-NEXT: s_mov_b32 s2, -1 -; GFX1132-TRUE16-NEXT: buffer_store_b32 v2, off, s[0:3], 0 +; GFX1132-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-TRUE16-NEXT: s_endpgm ; ; GFX1132-FAKE16-LABEL: uniform_fadd_v2bf16: @@ -12562,37 +12618,38 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1132-FAKE16-NEXT: s_and_b32 s3, s0, 0xffff0000 ; GFX1132-FAKE16-NEXT: s_mov_b32 s5, s11 ; GFX1132-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v0, s4 ; GFX1132-FAKE16-NEXT: s_mov_b32 s4, s10 ; GFX1132-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-FAKE16-NEXT: .p2align 6 ; GFX1132-FAKE16-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX1132-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX1132-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v2, v0 +; GFX1132-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-FAKE16-NEXT: v_dual_add_f32 v1, s3, v1 :: v_dual_lshlrev_b32 v0, 16, v2 +; GFX1132-FAKE16-NEXT: v_bfe_u32 
v4, v1, 16, 1 +; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX1132-FAKE16-NEXT: v_add_f32_e32 v0, s2, v0 -; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-FAKE16-NEXT: v_add_f32_e32 v2, s3, v2 +; GFX1132-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX1132-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX1132-FAKE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff +; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX1132-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1 ; GFX1132-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX1132-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX1132-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX1132-FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX1132-FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff ; GFX1132-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v0, v0 -; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo +; GFX1132-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff ; GFX1132-FAKE16-NEXT: v_cndmask_b32_e64 v0, v3, v5, s0 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 -; GFX1132-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc -; GFX1132-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX1132-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX1132-FAKE16-NEXT: v_perm_b32 v1, v1, v0, 0x7060302 +; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v1, v2 
+; GFX1132-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc +; GFX1132-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX1132-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2 ; GFX1132-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -12602,7 +12659,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1132-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1132-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1132-FAKE16-NEXT: s_mov_b32 s10, -1 -; GFX1132-FAKE16-NEXT: buffer_store_b32 v2, off, s[8:11], 0 +; GFX1132-FAKE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0 ; GFX1132-FAKE16-NEXT: s_endpgm ; ; GFX1264-TRUE16-LABEL: uniform_fadd_v2bf16: @@ -12619,39 +12676,39 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1264-TRUE16-NEXT: s_lshl_b32 s11, s4, 16 ; GFX1264-TRUE16-NEXT: s_mov_b32 s4, s2 ; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v1, s5 +; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v0, s5 ; GFX1264-TRUE16-NEXT: s_mov_b32 s5, s3 ; GFX1264-TRUE16-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX1264-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1264-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX1264-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v2, v0 +; GFX1264-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX1264-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1264-TRUE16-NEXT: v_add_f32_e32 v0, s11, v0 +; GFX1264-TRUE16-NEXT: v_add_f32_e32 v1, s10, v1 ; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) -; GFX1264-TRUE16-NEXT: v_add_f32_e32 v2, s10, v2 ; GFX1264-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1264-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX1264-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 ; GFX1264-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX1264-TRUE16-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX1264-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX1264-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 ; GFX1264-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX1264-TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff +; GFX1264-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff ; GFX1264-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1264-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc -; GFX1264-TRUE16-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX1264-TRUE16-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX1264-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX1264-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX1264-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1264-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v2 -; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v3, v1 -; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v2, v0 -; GFX1264-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1264-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1264-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX1264-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc +; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v0, v1 +; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v0, v1 ; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1264-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1264-TRUE16-NEXT: s_wait_loadcnt 0x0 +; 
GFX1264-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX1264-TRUE16-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX1264-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1264-TRUE16-NEXT: s_and_not1_b64 exec, exec, s[8:9] @@ -12660,7 +12717,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1264-TRUE16-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1264-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1264-TRUE16-NEXT: s_mov_b32 s2, -1 -; GFX1264-TRUE16-NEXT: buffer_store_b32 v2, off, s[0:3], null +; GFX1264-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], null ; GFX1264-TRUE16-NEXT: s_endpgm ; ; GFX1264-FAKE16-LABEL: uniform_fadd_v2bf16: @@ -12678,37 +12735,37 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1264-FAKE16-NEXT: s_mov_b32 s4, s10 ; GFX1264-FAKE16-NEXT: s_mov_b32 s5, s11 ; GFX1264-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v1, s1 +; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v0, s1 ; GFX1264-FAKE16-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX1264-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1264-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX1264-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v2, v0 +; GFX1264-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX1264-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1264-FAKE16-NEXT: v_add_f32_e32 v0, s12, v0 +; GFX1264-FAKE16-NEXT: v_add_f32_e32 v1, s13, v1 ; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1264-FAKE16-NEXT: v_add_f32_e32 v2, s13, v2 ; GFX1264-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; 
GFX1264-FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX1264-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 ; GFX1264-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX1264-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX1264-FAKE16-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX1264-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX1264-FAKE16-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX1264-FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX1264-FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff +; GFX1264-FAKE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff ; GFX1264-FAKE16-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0 ; GFX1264-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1264-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc +; GFX1264-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc ; GFX1264-FAKE16-NEXT: s_wait_alu 0xf1ff ; GFX1264-FAKE16-NEXT: v_cndmask_b32_e64 v0, v3, v5, s[0:1] -; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1264-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 -; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v3, v1 -; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v2, v0 -; GFX1264-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1264-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX1264-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264-FAKE16-NEXT: v_perm_b32 v1, v1, v0, 0x7060302 +; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v0, v1 ; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1264-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1264-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1264-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX1264-FAKE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1264-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1264-FAKE16-NEXT: s_and_not1_b64 exec, exec, s[2:3] @@ 
-12717,7 +12774,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1264-FAKE16-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1264-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1264-FAKE16-NEXT: s_mov_b32 s10, -1 -; GFX1264-FAKE16-NEXT: buffer_store_b32 v2, off, s[8:11], null +; GFX1264-FAKE16-NEXT: buffer_store_b32 v0, off, s[8:11], null ; GFX1264-FAKE16-NEXT: s_endpgm ; ; GFX1232-TRUE16-LABEL: uniform_fadd_v2bf16: @@ -12734,38 +12791,39 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1232-TRUE16-NEXT: s_lshl_b32 s10, s4, 16 ; GFX1232-TRUE16-NEXT: s_mov_b32 s4, s2 ; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v1, s5 +; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v0, s5 ; GFX1232-TRUE16-NEXT: s_mov_b32 s5, s3 ; GFX1232-TRUE16-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX1232-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1232-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX1232-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v2, v0 +; GFX1232-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232-TRUE16-NEXT: v_dual_add_f32 v1, s9, v1 :: v_dual_lshlrev_b32 v0, 16, v2 +; GFX1232-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1232-TRUE16-NEXT: v_add_f32_e32 v0, s10, v0 -; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1232-TRUE16-NEXT: v_add_f32_e32 v2, s9, v2 +; GFX1232-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX1232-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff +; GFX1232-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1232-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1232-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1 ; GFX1232-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX1232-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX1232-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; GFX1232-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX1232-TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff ; GFX1232-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX1232-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo -; GFX1232-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX1232-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX1232-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX1232-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX1232-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo +; GFX1232-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX1232-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v2 -; GFX1232-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1232-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1232-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1232-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX1232-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v0, v1 +; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v0, v1 ; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1232-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1232-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1232-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2 ; GFX1232-TRUE16-NEXT: s_or_b32 s8, vcc_lo, s8 ; GFX1232-TRUE16-NEXT: 
s_wait_alu 0xfffe ; GFX1232-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s8 @@ -12774,7 +12832,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1232-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX1232-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1232-TRUE16-NEXT: s_mov_b32 s2, -1 -; GFX1232-TRUE16-NEXT: buffer_store_b32 v2, off, s[0:3], null +; GFX1232-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], null ; GFX1232-TRUE16-NEXT: s_endpgm ; ; GFX1232-FAKE16-LABEL: uniform_fadd_v2bf16: @@ -12791,37 +12849,38 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1232-FAKE16-NEXT: s_and_b32 s3, s0, 0xffff0000 ; GFX1232-FAKE16-NEXT: s_mov_b32 s5, s11 ; GFX1232-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v0, s4 ; GFX1232-FAKE16-NEXT: s_mov_b32 s4, s10 ; GFX1232-FAKE16-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX1232-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1232-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX1232-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v2, v0 +; GFX1232-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232-FAKE16-NEXT: v_dual_add_f32 v1, s3, v1 :: v_dual_lshlrev_b32 v0, 16, v2 +; GFX1232-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX1232-FAKE16-NEXT: v_add_f32_e32 v0, s2, v0 -; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1232-FAKE16-NEXT: v_add_f32_e32 v2, s3, v2 +; GFX1232-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; 
GFX1232-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX1232-FAKE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff +; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1232-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1232-FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1 ; GFX1232-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX1232-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX1232-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX1232-FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX1232-FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff ; GFX1232-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v0, v0 ; GFX1232-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1232-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo +; GFX1232-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX1232-FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff ; GFX1232-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX1232-FAKE16-NEXT: v_cndmask_b32_e64 v0, v3, v5, s0 ; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 -; GFX1232-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1232-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1232-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX1232-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX1232-FAKE16-NEXT: v_cndmask_b32_e64 v0, v3, v5, s0 +; GFX1232-FAKE16-NEXT: v_perm_b32 v1, v1, v0, 0x7060302 +; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v0, v1 ; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1232-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1232-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1232-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2 ; GFX1232-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; 
GFX1232-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX1232-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -12830,7 +12889,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1232-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1232-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1232-FAKE16-NEXT: s_mov_b32 s10, -1 -; GFX1232-FAKE16-NEXT: buffer_store_b32 v2, off, s[8:11], null +; GFX1232-FAKE16-NEXT: buffer_store_b32 v0, off, s[8:11], null ; GFX1232-FAKE16-NEXT: s_endpgm %rmw = atomicrmw fadd ptr addrspace(1) %uniform.ptr, <2 x bfloat> %val monotonic, align 4 store <2 x bfloat> %rmw, ptr addrspace(1) %result diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll index 006fe51a32c72..a18eef6bd40aa 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll @@ -32,12 +32,12 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: S_CBRANCH_VCCZ %bb.2, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.1.bb103: - ; GFX90A-NEXT: successors: %bb.58(0x40000000), %bb.2(0x40000000) + ; GFX90A-NEXT: successors: %bb.59(0x40000000), %bb.2(0x40000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr30_sgpr31, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.58, implicit $vcc + ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.59, implicit $vcc ; 
GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.2: ; GFX90A-NEXT: successors: %bb.3(0x80000000) @@ -51,14 +51,14 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 0 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.3.Flow17: - ; GFX90A-NEXT: successors: %bb.4(0x40000000), %bb.57(0x40000000) + ; GFX90A-NEXT: successors: %bb.4(0x40000000), %bb.58(0x40000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr30 = V_AND_B32_e32 1023, $vgpr31, implicit $exec ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr34_sgpr35, implicit-def dead $scc ; GFX90A-NEXT: renamable $vgpr15 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr17 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec - ; GFX90A-NEXT: S_CBRANCH_VCCZ %bb.57, implicit $vcc + ; GFX90A-NEXT: S_CBRANCH_VCCZ %bb.58, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.4.bb15: ; GFX90A-NEXT: successors: %bb.35(0x40000000), %bb.5(0x40000000) @@ -122,12 +122,12 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr24 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.7.Flow19: - ; GFX90A-NEXT: successors: %bb.62(0x40000000), %bb.8(0x40000000) + ; GFX90A-NEXT: successors: %bb.63(0x40000000), %bb.8(0x40000000) ; GFX90A-NEXT: liveins: 
$sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_MOV_B64 0 ; GFX90A-NEXT: $sgpr24_sgpr25 = S_AND_SAVEEXEC_B64 $sgpr36_sgpr37, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.62, implicit $exec + ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.63, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.8.Flow32: ; GFX90A-NEXT: successors: %bb.9(0x40000000), %bb.10(0x40000000) @@ -506,8 +506,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: S_BRANCH %bb.38 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.41.bb41: - ; GFX90A-NEXT: successors: %bb.46(0x40000000), %bb.42(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, 
$vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67 + ; GFX90A-NEXT: successors: %bb.47(0x40000000), %bb.42(0x40000000) + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr62_sgpr63, $sgpr66_sgpr67 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr58 = V_ADD_CO_U32_e32 4096, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = COPY $vcc @@ -533,17 +533,17 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 ; GFX90A-NEXT: 
$sgpr42_sgpr43 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.46, implicit $exec + ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.47, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.42.Flow24: ; GFX90A-NEXT: successors: %bb.40(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr64_sgpr65, 
$sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr42_sgpr43, implicit-def $scc ; GFX90A-NEXT: renamable $vgpr59 = COPY killed renamable $vgpr18, implicit $exec ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_XOR_B64 $exec, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc @@ -556,8 +556,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: S_BRANCH %bb.40 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.43.bb55: - ; GFX90A-NEXT: successors: %bb.48(0x40000000), %bb.44(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, 
$vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr44_sgpr45, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57, $sgpr48_sgpr49 + ; GFX90A-NEXT: successors: %bb.49(0x40000000), %bb.44(0x40000000) + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr44_sgpr45, $sgpr58_sgpr59, $sgpr56_sgpr57, $sgpr48_sgpr49 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: S_BITCMP1_B32 killed renamable $sgpr17, 16, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_CSELECT_B64 -1, 0, implicit killed $scc @@ -565,13 +565,19 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr62 = 
V_ADD_CO_U32_e32 6144, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $vgpr63, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr50_sgpr51, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.48, implicit $vcc + ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.49, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.44: ; GFX90A-NEXT: successors: %bb.45(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr57, $vgpr56, $vgpr18, $vgpr30, $vgpr31, $vgpr60, $vgpr62, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $vgpr61, $vgpr58, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr47, $vgpr46, $vgpr2, $vgpr3, $vgpr45, $vgpr44, $vgpr43, $vgpr42, $vgpr41, $vgpr40, $vgpr63 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr58_sgpr59, $sgpr56_sgpr57, $sgpr48_sgpr49 ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 0 ; GFX90A-NEXT: renamable 
$sgpr52_sgpr53 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: bb.45.Flow26: + ; GFX90A-NEXT: successors: %bb.46(0x80000000) + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr57, $vgpr56, $vgpr18, $vgpr30, $vgpr31, $vgpr60, $vgpr62, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr47, $vgpr46, $vgpr2, $vgpr3, $vgpr45, $vgpr44, $vgpr43, $vgpr42, $vgpr41, $vgpr40, $vgpr63, $vgpr58, $vgpr61 + ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF @@ -586,8 +592,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.45.Flow26: - ; GFX90A-NEXT: successors: %bb.47(0x80000000) + ; GFX90A-NEXT: bb.46.Flow26: + ; GFX90A-NEXT: successors: %bb.48(0x80000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, 
$vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_XOR_B64 $exec, -1, implicit-def dead $scc @@ -595,16 +601,16 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_OR_B64 killed renamable $sgpr44_sgpr45, killed renamable $sgpr46_sgpr47, implicit-def dead $scc - ; GFX90A-NEXT: S_BRANCH %bb.47 + ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr44_sgpr45, killed renamable $sgpr46_sgpr47, implicit-def dead $scc + ; 
GFX90A-NEXT: S_BRANCH %bb.48 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.46.bb48: - ; GFX90A-NEXT: successors: %bb.43(0x40000000), %bb.47(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr64_sgpr65, $sgpr50_sgpr51, $sgpr66_sgpr67, $sgpr44_sgpr45, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57 + ; GFX90A-NEXT: bb.47.bb48: + ; GFX90A-NEXT: successors: %bb.43(0x40000000), %bb.48(0x40000000) + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr62_sgpr63, $sgpr50_sgpr51, $sgpr66_sgpr67, $sgpr44_sgpr45, $sgpr58_sgpr59, $sgpr56_sgpr57 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr60 = V_ADD_CO_U32_e32 5120, $vgpr40, implicit-def $vcc, implicit $exec ; 
GFX90A-NEXT: renamable $sgpr18_sgpr19 = COPY $vcc @@ -613,7 +619,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr0 = FLAT_LOAD_UBYTE killed renamable $vgpr0_vgpr1, 1024, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i51) ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr61, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $sgpr18_sgpr19, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec @@ -634,53 +640,56 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: $sgpr18_sgpr19 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.43, implicit $exec ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.47.Flow25: + ; GFX90A-NEXT: bb.48.Flow25: ; GFX90A-NEXT: successors: %bb.42(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr70_sgpr71, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, 
$vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr70_sgpr71, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr18_sgpr19, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable 
$sgpr60_sgpr61, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr70_sgpr71, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr66_sgpr67, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_OR_B64 killed renamable $sgpr46_sgpr47, killed renamable $sgpr48_sgpr49, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.42 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.48.bb63: - ; GFX90A-NEXT: successors: %bb.50(0x40000000), %bb.49(0x40000000) - ; GFX90A-NEXT: liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47:0x000000000000000F, $sgpr50_sgpr51, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, 
$vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57, $sgpr48_sgpr49 + ; GFX90A-NEXT: bb.49.bb63: + ; GFX90A-NEXT: successors: %bb.51(0x40000000), %bb.50(0x40000000) + ; GFX90A-NEXT: liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47:0x000000000000000F, $sgpr50_sgpr51, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr58_sgpr59, $sgpr56_sgpr57, $sgpr48_sgpr49 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0 - ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.50, implicit $vcc + ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.51, implicit $vcc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.49: - ; GFX90A-NEXT: successors: %bb.44(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, 
$sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57 + ; GFX90A-NEXT: bb.50: + ; GFX90A-NEXT: successors: %bb.45(0x80000000) + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr58_sgpr59, $sgpr56_sgpr57 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 -1 - ; GFX90A-NEXT: S_BRANCH %bb.44 + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 0 + ; GFX90A-NEXT: S_BRANCH %bb.45 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.50.bb68: - ; GFX90A-NEXT: successors: %bb.54(0x40000000), %bb.51(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, 
$sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr50_sgpr51, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57 + ; GFX90A-NEXT: bb.51.bb68: + ; GFX90A-NEXT: successors: %bb.55(0x40000000), %bb.52(0x40000000) + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr50_sgpr51, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr58_sgpr59, $sgpr56_sgpr57 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = nuw nsw V_LSHLREV_B32_e32 3, $vgpr30, implicit $exec ; GFX90A-NEXT: renamable $vgpr1 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr50_sgpr51, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.54, implicit $vcc + ; 
GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.55, implicit $vcc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.51: - ; GFX90A-NEXT: successors: %bb.45(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57 + ; GFX90A-NEXT: bb.52: + ; GFX90A-NEXT: successors: %bb.46(0x80000000) + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr58_sgpr59, $sgpr56_sgpr57 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = 
S_MOV_B64 -1 ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF @@ -692,20 +701,20 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 - ; GFX90A-NEXT: S_BRANCH %bb.45 + ; GFX90A-NEXT: S_BRANCH %bb.46 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.52.bb80: - ; GFX90A-NEXT: successors: %bb.59(0x40000000), %bb.53(0x40000000) + ; GFX90A-NEXT: bb.53.bb80: + ; GFX90A-NEXT: successors: %bb.60(0x40000000), %bb.54(0x40000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr17 = S_BFE_U32 renamable $sgpr20, 65560, implicit-def dead $scc ; GFX90A-NEXT: S_CMP_EQ_U32 killed renamable $sgpr17, 0, implicit-def $scc ; 
GFX90A-NEXT: renamable $vgpr6 = V_ADD_CO_U32_e32 4096, $vgpr0, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $vgpr7, dead renamable $sgpr52_sgpr53 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec - ; GFX90A-NEXT: S_CBRANCH_SCC1 %bb.59, implicit killed $scc + ; GFX90A-NEXT: S_CBRANCH_SCC1 %bb.60, implicit killed $scc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.53: - ; GFX90A-NEXT: successors: %bb.61(0x80000000) + ; GFX90A-NEXT: bb.54: + ; GFX90A-NEXT: successors: %bb.62(0x80000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 0 @@ -720,17 +729,18 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 - ; GFX90A-NEXT: S_BRANCH %bb.61 + ; GFX90A-NEXT: S_BRANCH %bb.62 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.54.bb73: - ; GFX90A-NEXT: successors: %bb.52(0x40000000), %bb.55(0x40000000) - ; GFX90A-NEXT: liveins: 
$sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55 + ; GFX90A-NEXT: bb.55.bb73: + ; GFX90A-NEXT: successors: %bb.53(0x40000000), %bb.56(0x40000000) + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr6 = FLAT_LOAD_UBYTE renamable $vgpr0_vgpr1, 2048, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i76) ; 
GFX90A-NEXT: renamable $vgpr4 = V_ADD_CO_U32_e32 2048, $vgpr0, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr5, dead renamable $sgpr58_sgpr59 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr6, implicit $exec ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_MOV_B64 0 @@ -745,17 +755,17 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 ; GFX90A-NEXT: $sgpr60_sgpr61 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.52, implicit $exec + ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.53, implicit $exec ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.55.Flow29: - ; GFX90A-NEXT: successors: %bb.45(0x80000000) + ; GFX90A-NEXT: bb.56.Flow29: + ; GFX90A-NEXT: successors: %bb.46(0x80000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, 
$vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr60_sgpr61, implicit-def $scc - ; GFX90A-NEXT: S_BRANCH %bb.45 + ; GFX90A-NEXT: S_BRANCH %bb.46 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.56.bb90: - ; GFX90A-NEXT: successors: %bb.60(0x80000000) + ; GFX90A-NEXT: bb.57.bb90: + ; GFX90A-NEXT: successors: %bb.61(0x80000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr53 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr64_sgpr65, implicit $exec @@ -773,9 +783,9 @@ define amdgpu_kernel void @f1(ptr 
addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_XOR_B64 $exec, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_OR_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $vgpr14, implicit $exec - ; GFX90A-NEXT: S_BRANCH %bb.60 + ; GFX90A-NEXT: S_BRANCH %bb.61 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.57: + ; GFX90A-NEXT: bb.58: ; GFX90A-NEXT: successors: %bb.7(0x80000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} @@ -810,7 +820,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 0 ; GFX90A-NEXT: S_BRANCH %bb.7 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.58.bb105: + ; GFX90A-NEXT: bb.59.bb105: ; GFX90A-NEXT: successors: %bb.3(0x80000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} @@ -827,8 +837,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = 
S_MOV_B64 -1 ; GFX90A-NEXT: S_BRANCH %bb.3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.59.bb85: - ; GFX90A-NEXT: successors: %bb.56(0x40000000), %bb.60(0x40000000) + ; GFX90A-NEXT: bb.60.bb85: + ; GFX90A-NEXT: successors: %bb.57(0x40000000), %bb.61(0x40000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr8 = V_OR_B32_e32 1, $vgpr6, implicit $exec @@ -846,17 +856,17 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 ; GFX90A-NEXT: $sgpr54_sgpr55 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.56, implicit $exec + ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.57, implicit $exec ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.60.Flow31: - ; GFX90A-NEXT: successors: %bb.61(0x80000000) + ; GFX90A-NEXT: bb.61.Flow31: + ; GFX90A-NEXT: successors: %bb.62(0x80000000) ; GFX90A-NEXT: liveins: $sgpr14, 
$sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr54_sgpr55, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.61.Flow30: - ; GFX90A-NEXT: successors: %bb.55(0x80000000) + ; GFX90A-NEXT: bb.62.Flow30: + ; GFX90A-NEXT: successors: %bb.56(0x80000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, 
$sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_XOR_B64 $exec, -1, implicit-def dead $scc @@ -865,48 +875,48 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_OR_B64 killed renamable $sgpr46_sgpr47, killed renamable $sgpr52_sgpr53, implicit-def dead $scc - ; GFX90A-NEXT: S_BRANCH %bb.55 + ; GFX90A-NEXT: S_BRANCH %bb.56 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.62.bb140: - ; GFX90A-NEXT: successors: %bb.68(0x40000000), %bb.63(0x40000000) + ; GFX90A-NEXT: bb.63.bb140: + ; GFX90A-NEXT: successors: %bb.69(0x40000000), %bb.64(0x40000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, 
$sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr30_sgpr31, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.68, implicit $vcc + ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.69, implicit $vcc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.63.Flow13: - ; GFX90A-NEXT: successors: %bb.64(0x40000000), %bb.66(0x40000000) + ; GFX90A-NEXT: bb.64.Flow13: + ; GFX90A-NEXT: successors: %bb.65(0x40000000), %bb.67(0x40000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, 
$vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr36_sgpr37, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.66, implicit $vcc + ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.67, implicit $vcc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.64.bb159: - ; GFX90A-NEXT: successors: %bb.67(0x40000000), %bb.65(0x40000000) + ; GFX90A-NEXT: bb.65.bb159: + ; GFX90A-NEXT: successors: %bb.68(0x40000000), %bb.66(0x40000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vcc = V_CMP_NE_U32_e64 
0, killed $vgpr30, implicit $exec ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_XOR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.67, implicit $exec + ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.68, implicit $exec ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.65.Flow10: - ; GFX90A-NEXT: successors: %bb.66(0x80000000) + ; GFX90A-NEXT: bb.66.Flow10: + ; GFX90A-NEXT: successors: %bb.67(0x80000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $sgpr12_sgpr13 = S_ANDN2_SAVEEXEC_B64 $sgpr12_sgpr13, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.66.Flow14: + ; GFX90A-NEXT: bb.67.Flow14: ; GFX90A-NEXT: successors: %bb.8(0x80000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, 
$sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = COPY $exec ; GFX90A-NEXT: S_BRANCH %bb.8 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.67.bb161: - ; GFX90A-NEXT: successors: %bb.65(0x80000000) + ; GFX90A-NEXT: bb.68.bb161: + ; GFX90A-NEXT: successors: %bb.66(0x80000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr21, killed $vgpr23, implicit $exec @@ -922,10 +932,10 @@ define amdgpu_kernel void @f1(ptr 
addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr2 = V_CNDMASK_B32_e64 0, 0, 0, killed $vgpr2, killed $vcc, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr15, implicit $exec ; GFX90A-NEXT: DS_WRITE2_B32_gfx9 killed renamable $vgpr3, killed renamable $vgpr2, renamable $vgpr3, 0, 1, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, align 4, addrspace 3) - ; GFX90A-NEXT: S_BRANCH %bb.65 + ; GFX90A-NEXT: S_BRANCH %bb.66 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.68.bb174: - ; GFX90A-NEXT: successors: %bb.72(0x40000000), %bb.69(0x40000000) + ; GFX90A-NEXT: bb.69.bb174: + ; GFX90A-NEXT: successors: %bb.73(0x40000000), %bb.70(0x40000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; 
GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr26 = V_OR_B32_e32 1, $vgpr24, implicit $exec @@ -938,17 +948,17 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr50 = V_CNDMASK_B32_e64 0, 0, 0, $vgpr32, killed $sgpr12_sgpr13, implicit $exec ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr28_sgpr29, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.72, implicit $vcc + ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.73, implicit $vcc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.69.Flow: - ; GFX90A-NEXT: successors: %bb.70(0x40000000), %bb.71(0x40000000) + ; GFX90A-NEXT: bb.70.Flow: + ; GFX90A-NEXT: successors: %bb.71(0x40000000), %bb.72(0x40000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, 
$vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.71, implicit $vcc + ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.72, implicit $vcc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.70.bb186: - ; GFX90A-NEXT: successors: %bb.71(0x80000000) + ; GFX90A-NEXT: bb.71.bb186: + ; GFX90A-NEXT: successors: %bb.72(0x80000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, 
$vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr2_vgpr3 = V_LSHLREV_B64_e64 3, killed $vgpr2_vgpr3, implicit $exec @@ -976,15 +986,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.71.Flow9: - ; GFX90A-NEXT: successors: %bb.63(0x80000000) + ; GFX90A-NEXT: bb.72.Flow9: + ; GFX90A-NEXT: successors: %bb.64(0x80000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable 
$sgpr36_sgpr37 = S_MOV_B64 0 - ; GFX90A-NEXT: S_BRANCH %bb.63 + ; GFX90A-NEXT: S_BRANCH %bb.64 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.72.bb196: - ; GFX90A-NEXT: successors: %bb.69(0x80000000) + ; GFX90A-NEXT: bb.73.bb196: + ; GFX90A-NEXT: successors: %bb.70(0x80000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr10 = V_OR_B32_e32 $vgpr50, killed $vgpr16, implicit $exec @@ -992,7 +1002,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, 
i64 ; GFX90A-NEXT: renamable $vgpr55 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr55, renamable $vgpr54_vgpr55, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_MOV_B64 0 - ; GFX90A-NEXT: S_BRANCH %bb.69 + ; GFX90A-NEXT: S_BRANCH %bb.70 bb: %i = tail call i32 @llvm.amdgcn.workitem.id.x() %i11 = icmp eq i32 %i, 0 diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll index 2ad7818bd3ca8..1ae5524568857 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll @@ -556,12 +556,12 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32 ; GCN-NEXT: s_add_pc_i64 .LBB10_7-.Lpost_addpc12 ; GCN-NEXT: .Lpost_addpc12: ; GCN-NEXT: .LBB10_4: ; %bb13 +; GCN-NEXT: s_mov_b32 s0, s8 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: v_nop_e64 ; GCN-NEXT: v_nop_e64 ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_sleep 0 -; GCN-NEXT: s_mov_b32 s0, s8 ; GCN-NEXT: s_sleep 0 ; GCN-NEXT: s_cbranch_execz .LBB10_5 ; GCN-NEXT: ; %bb.16: ; %bb13 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll index f4b432dce8c8a..cb1a35bb8e836 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll @@ -248,20 +248,20 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX10-NEXT: s_add_i32 s4, s20, 0x400 ; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; 
GFX10-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f32_e32 v4, v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: v_mov_b32_e32 v2, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB1_1 @@ -291,22 +291,22 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s20 -; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_add_f32_e32 v4, v5, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v4 ; GFX8-NEXT: s_andn2_b64 
exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB1_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -317,22 +317,22 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s20 -; GFX7-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v4, v1 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_add_f32_e32 v4, v5, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: v_mov_b32_e32 v2, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v2, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB1_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -343,23 +343,23 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s20 -; GFX6-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: 
v_add_f32_e32 v1, v2, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 +; GFX6-NEXT: v_add_f32_e32 v4, v5, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v2 -; GFX6-NEXT: v_mov_b32_e32 v4, v1 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: v_mov_b32_e32 v2, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v2, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB1_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1038,20 +1038,20 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX10-NEXT: s_add_i32 s4, s20, 0x400 ; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f32_e32 v4, v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: v_mov_b32_e32 v2, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: 
s_cbranch_execnz .LBB4_1 @@ -1063,21 +1063,21 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_f32_e32 v2, v3, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_add_f32_e32 v4, v5, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB4_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1088,22 +1088,22 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 -; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: 
v_add_f32_e32 v1, v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v5, v2 -; GFX908-NEXT: v_mov_b32_e32 v4, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_add_f32_e32 v4, v5, v0 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1114,22 +1114,22 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s20 -; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_add_f32_e32 v4, v5, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; 
GFX8-NEXT: s_cbranch_execnz .LBB4_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1140,22 +1140,22 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s20 -; GFX7-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v4, v1 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_add_f32_e32 v4, v5, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: v_mov_b32_e32 v2, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v2, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB4_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1166,23 +1166,23 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s20 -; GFX6-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v1, 
v2, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 +; GFX6-NEXT: v_add_f32_e32 v4, v5, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v2 -; GFX6-NEXT: v_mov_b32_e32 v4, v1 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: v_mov_b32_e32 v2, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v2, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB4_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2137,20 +2137,20 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen offset:2048 +; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_add_f64_e32 v[2:3], v[4:5], v[0:1] -; GFX12-NEXT: v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4 +; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_f64_e32 v[7:8], v[9:10], v[0:1] +; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX12-NEXT: v_dual_mov_b32 v4, v9 :: v_dual_mov_b32 v5, v10 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[2:5], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: 
s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[9:10] ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2178,21 +2178,21 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v6, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX11-NEXT: v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4 +; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f64 v[7:8], v[9:10], v[0:1] +; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX11-NEXT: v_dual_mov_b32 v4, v9 :: v_dual_mov_b32 v5, v10 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[2:5], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[9:10] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2208,23 +2208,23 @@ define 
void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX10-NEXT: s_add_i32 s4, s20, 0x800 ; GFX10-NEXT: v_mov_b32_e32 v6, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 +; GFX10-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048 ; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX10-NEXT: v_mov_b32_e32 v10, v5 -; GFX10-NEXT: v_mov_b32_e32 v9, v4 +; GFX10-NEXT: v_mov_b32_e32 v10, v3 +; GFX10-NEXT: v_mov_b32_e32 v9, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v8, v3 -; GFX10-NEXT: v_mov_b32_e32 v7, v2 -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f64 v[7:8], v[9:10], v[0:1] +; GFX10-NEXT: v_mov_b32_e32 v2, v7 +; GFX10-NEXT: v_mov_b32_e32 v3, v8 +; GFX10-NEXT: v_mov_b32_e32 v4, v9 +; GFX10-NEXT: v_mov_b32_e32 v5, v10 +; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[2:5], v6, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5] -; GFX10-NEXT: v_mov_b32_e32 v4, v7 -; GFX10-NEXT: v_mov_b32_e32 v5, v8 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[9:10] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB9_1 @@ -2245,25 +2245,25 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v2, s20 -; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 +; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048 ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; 
GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX908-NEXT: v_mov_b32_e32 v10, v5 -; GFX908-NEXT: v_mov_b32_e32 v9, v4 -; GFX908-NEXT: v_mov_b32_e32 v8, v3 -; GFX908-NEXT: v_mov_b32_e32 v7, v2 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v10, v3 +; GFX908-NEXT: v_mov_b32_e32 v9, v2 +; GFX908-NEXT: v_add_f64 v[7:8], v[9:10], v[0:1] +; GFX908-NEXT: v_mov_b32_e32 v2, v7 +; GFX908-NEXT: v_mov_b32_e32 v3, v8 +; GFX908-NEXT: v_mov_b32_e32 v4, v9 +; GFX908-NEXT: v_mov_b32_e32 v5, v10 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[2:5], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v7 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[9:10] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v8 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB9_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2274,25 +2274,25 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, s20 -; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 +; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048 ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v10, v5 -; GFX8-NEXT: v_mov_b32_e32 v9, v4 -; GFX8-NEXT: v_mov_b32_e32 v8, v3 -; GFX8-NEXT: v_mov_b32_e32 v7, v2 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, 
s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v10, v3 +; GFX8-NEXT: v_mov_b32_e32 v9, v2 +; GFX8-NEXT: v_add_f64 v[7:8], v[9:10], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, v7 +; GFX8-NEXT: v_mov_b32_e32 v3, v8 +; GFX8-NEXT: v_mov_b32_e32 v4, v9 +; GFX8-NEXT: v_mov_b32_e32 v5, v10 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[2:5], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v7 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[9:10] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v8 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB9_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2303,25 +2303,25 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 +; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048 ; GFX7-NEXT: s_add_i32 s6, s20, 0x800 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s6 ; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v10, v5 -; GFX7-NEXT: v_mov_b32_e32 v9, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v10, v3 +; GFX7-NEXT: v_mov_b32_e32 v9, v2 +; GFX7-NEXT: v_add_f64 v[7:8], v[9:10], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, v7 +; GFX7-NEXT: v_mov_b32_e32 v3, v8 +; GFX7-NEXT: v_mov_b32_e32 v4, v9 +; GFX7-NEXT: v_mov_b32_e32 v5, v10 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[2:5], v6, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; 
GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[9:10] ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v5, v8 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB9_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2332,26 +2332,26 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 +; GFX6-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048 ; GFX6-NEXT: s_add_i32 s6, s20, 0x800 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_mov_b32_e32 v6, s6 ; GFX6-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX6-NEXT: v_mov_b32_e32 v10, v3 +; GFX6-NEXT: v_mov_b32_e32 v9, v2 +; GFX6-NEXT: v_add_f64 v[7:8], v[9:10], v[0:1] ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v10, v5 -; GFX6-NEXT: v_mov_b32_e32 v9, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v2 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v2, v7 +; GFX6-NEXT: v_mov_b32_e32 v3, v8 +; GFX6-NEXT: v_mov_b32_e32 v4, v9 +; GFX6-NEXT: v_mov_b32_e32 v5, v10 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[2:5], v6, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] -; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[9:10] ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v5, v8 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB9_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3430,41 +3430,41 @@ define half 
@buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, s4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen +; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v3, s[0:3], null offen ; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 ; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v1 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5 ; GFX12-TRUE16-NEXT: v_add_f16_e32 v1.l, v1.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX12-TRUE16-NEXT: 
buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -3479,41 +3479,41 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, s4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-FAKE16-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen +; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v3, s[0:3], null offen ; GFX12-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 ; GFX12-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v1 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5 ; GFX12-FAKE16-NEXT: v_add_f16_e32 v1, v1, v0 +; 
GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 -; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -3523,7 +3523,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX942-NEXT: s_addk_i32 s16, 0x200 ; GFX942-NEXT: s_and_b32 s4, s16, -4 ; GFX942-NEXT: v_mov_b32_e32 v1, s4 -; GFX942-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen +; GFX942-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen ; GFX942-NEXT: s_and_b32 s4, s16, 3 ; GFX942-NEXT: 
s_lshl_b32 s6, s4, 3 ; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6 @@ -3532,23 +3532,23 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v2, s6, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v2, s6, v5 ; GFX942-NEXT: v_add_f16_e32 v2, v2, v0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, s6, v2 -; GFX942-NEXT: v_and_or_b32 v2, v3, s7, v2 -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX942-NEXT: v_and_or_b32 v4, v5, s7, v2 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[4:5] ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 ; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX942-NEXT: s_cbranch_execnz .LBB13_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, s6, v4 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -3557,40 +3557,40 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | 
instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v3, s[0:3], 0 offen ; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 ; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v1 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5 ; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, v1.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 -; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, 
s5 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB13_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -3599,40 +3599,40 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, s4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-FAKE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v3, s[0:3], 0 offen ; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 ; GFX11-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v1 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5 ; GFX11-FAKE16-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 -; GFX11-FAKE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX11-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB13_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -3640,35 +3640,35 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: v_mov_b32_e32 v5, s4 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_and_b32 s4, s20, 3 ; GFX10-NEXT: s_lshl_b32 s4, s4, 3 ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: buffer_load_dword v2, v5, s[16:19], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX10-NEXT: s_not_b32 s6, s5 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) 
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, s4, v5 ; GFX10-NEXT: v_add_f16_e32 v1, v1, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX10-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: v_mov_b32_e32 v2, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB13_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -3677,7 +3677,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX90A-NEXT: s_addk_i32 s20, 0x200 ; GFX90A-NEXT: s_and_b32 s4, s20, -4 ; GFX90A-NEXT: v_mov_b32_e32 v1, s4 -; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen +; GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 ; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 ; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 @@ -3686,22 +3686,22 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: 
s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, s6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, s6, v5 ; GFX90A-NEXT: v_add_f16_e32 v2, v2, v0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, s6, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc +; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -3709,8 +3709,8 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 ; GFX908-NEXT: s_and_b32 s4, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v5, s4 -; GFX908-NEXT: buffer_load_dword v2, v5, s[16:19], 0 offen +; GFX908-NEXT: v_mov_b32_e32 v3, s4 +; GFX908-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 ; GFX908-NEXT: s_lshl_b32 s6, s4, 3 ; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 @@ -3719,23 +3719,23 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; 
GFX908-NEXT: v_lshrrev_b32_e32 v1, s6, v2 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_lshrrev_b32_e32 v1, s6, v5 ; GFX908-NEXT: v_add_f16_e32 v1, v1, v0 ; GFX908-NEXT: v_lshlrev_b32_e32 v1, s6, v1 -; GFX908-NEXT: v_and_or_b32 v1, v2, s7, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX908-NEXT: v_and_or_b32 v4, v5, s7, v1 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v3 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -3743,8 +3743,8 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 ; GFX8-NEXT: s_and_b32 s4, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: buffer_load_dword v2, v5, s[16:19], 0 offen +; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 ; GFX8-NEXT: s_lshl_b32 s6, s4, 3 ; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 @@ -3753,24 +3753,24 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 
s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, s6, v5 ; GFX8-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX8-NEXT: v_and_b32_e32 v3, s7, v2 +; GFX8-NEXT: v_and_b32_e32 v2, s7, v5 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, s6, v1 -; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc +; GFX8-NEXT: v_or_b32_e32 v4, v2, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -3778,38 +3778,38 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 ; GFX7-NEXT: s_and_b32 s4, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v3, s4 +; GFX7-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_and_b32 s4, s20, 3 ; GFX7-NEXT: s_lshl_b32 s6, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX7-NEXT: v_add_f32_e32 v0, v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, s6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_and_b32_e32 v2, s7, v5 +; GFX7-NEXT: v_add_f32_e32 v1, v1, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX7-NEXT: v_or_b32_e32 v4, v2, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: v_mov_b32_e32 v2, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -3818,39 +3818,39 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 ; GFX6-NEXT: s_and_b32 s4, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v4, s4 -; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX6-NEXT: v_mov_b32_e32 v3, s4 +; GFX6-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s4, s20, 3 ; GFX6-NEXT: s_lshl_b32 s6, s4, 3 ; GFX6-NEXT: s_lshl_b32 s4, 
0xffff, s6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX6-NEXT: v_add_f32_e32 v0, v0, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: v_mov_b32_e32 v3, v1 -; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX6-NEXT: v_lshrrev_b32_e32 v1, s6, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_and_b32_e32 v2, s7, v5 +; GFX6-NEXT: v_add_f32_e32 v1, v1, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX6-NEXT: v_or_b32_e32 v4, v2, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: v_mov_b32_e32 v2, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB13_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -3877,28 +3877,28 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; 
GFX12-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen +; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v3, s[0:3], null offen ; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 ; GFX12-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v1 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5 ; GFX12-TRUE16-NEXT: v_add_f16_e32 v1.l, v1.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -3925,28 +3925,28 @@ define 
void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-FAKE16-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen +; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v3, s[0:3], null offen ; GFX12-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 ; GFX12-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v1 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5 ; GFX12-FAKE16-NEXT: v_add_f16_e32 v1, v1, v0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 -; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-FAKE16-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -3962,7 +3962,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX942-NEXT: s_addk_i32 s16, 0x200 ; GFX942-NEXT: s_and_b32 s4, s16, -4 ; GFX942-NEXT: v_mov_b32_e32 v1, s4 -; GFX942-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen +; GFX942-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen ; GFX942-NEXT: s_and_b32 s4, s16, 3 ; GFX942-NEXT: s_lshl_b32 s6, s4, 3 ; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6 @@ -3971,18 +3971,18 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v2, s6, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v2, s6, v5 ; GFX942-NEXT: v_add_f16_e32 v2, v2, v0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, s6, v2 -; GFX942-NEXT: v_and_or_b32 v2, v3, s7, v2 -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX942-NEXT: v_and_or_b32 v4, v5, s7, v2 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[4:5] ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 ; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX942-NEXT: s_cbranch_execnz .LBB14_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4000,28 +4000,28 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: 
s_lshl_b32 s4, s4, 3 ; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v3, s[0:3], 0 offen ; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 ; GFX11-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v1 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5 ; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, v1.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 -; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -4041,28 +4041,28 @@ define void 
@buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-FAKE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v3, s[0:3], 0 offen ; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 ; GFX11-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v1 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5 ; GFX11-FAKE16-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 -; GFX11-FAKE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; 
GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -4080,25 +4080,25 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX10-NEXT: s_and_b32 s4, s20, 3 ; GFX10-NEXT: s_lshl_b32 s4, s4, 3 ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: buffer_load_dword v2, v3, s[16:19], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX10-NEXT: s_not_b32 s6, s5 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, s4, v5 ; GFX10-NEXT: v_add_f16_e32 v1, v1, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX10-NEXT: v_mov_b32_e32 v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX10-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: v_mov_b32_e32 v2, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB14_1 @@ -4112,7 +4112,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX90A-NEXT: s_addk_i32 s20, 0x200 ; GFX90A-NEXT: s_and_b32 s4, s20, -4 ; GFX90A-NEXT: v_mov_b32_e32 v1, s4 -; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen +; GFX90A-NEXT: 
buffer_load_dword v2, v1, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 ; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 ; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 @@ -4121,17 +4121,17 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, s6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, s6, v5 ; GFX90A-NEXT: v_add_f16_e32 v2, v2, v0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, s6, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc +; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4144,7 +4144,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX908-NEXT: s_addk_i32 s20, 0x200 ; GFX908-NEXT: s_and_b32 s4, s20, -4 ; GFX908-NEXT: v_mov_b32_e32 v3, s4 -; GFX908-NEXT: buffer_load_dword v2, v3, s[16:19], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 ; GFX908-NEXT: s_lshl_b32 s6, s4, 3 ; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 @@ -4153,18 +4153,18 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt 
vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v1, s6, v2 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_lshrrev_b32_e32 v1, s6, v5 ; GFX908-NEXT: v_add_f16_e32 v1, v1, v0 ; GFX908-NEXT: v_lshlrev_b32_e32 v1, s6, v1 -; GFX908-NEXT: v_and_or_b32 v1, v2, s7, v1 -; GFX908-NEXT: v_mov_b32_e32 v5, v2 -; GFX908-NEXT: v_mov_b32_e32 v4, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_and_or_b32 v4, v5, s7, v1 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4177,7 +4177,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX8-NEXT: s_addk_i32 s20, 0x200 ; GFX8-NEXT: s_and_b32 s4, s20, -4 ; GFX8-NEXT: v_mov_b32_e32 v3, s4 -; GFX8-NEXT: buffer_load_dword v2, v3, s[16:19], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 ; GFX8-NEXT: s_lshl_b32 s6, s4, 3 ; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 @@ -4186,19 +4186,19 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, s6, v5 ; GFX8-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX8-NEXT: v_and_b32_e32 v4, s7, v2 +; GFX8-NEXT: v_and_b32_e32 v2, s7, v5 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, s6, v1 -; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v2 -; GFX8-NEXT: 
v_mov_b32_e32 v4, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_or_b32_e32 v4, v2, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4210,33 +4210,33 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 ; GFX7-NEXT: s_and_b32 s4, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v3, s4 +; GFX7-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_and_b32 s4, s20, 3 ; GFX7-NEXT: s_lshl_b32 s6, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX7-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX7-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX7-NEXT: v_lshrrev_b32_e32 v1, s6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_and_b32_e32 v2, s7, v5 +; GFX7-NEXT: 
v_add_f32_e32 v1, v1, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX7-NEXT: v_or_b32_e32 v4, v2, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: v_mov_b32_e32 v2, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB14_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4248,34 +4248,34 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 ; GFX6-NEXT: s_and_b32 s4, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX6-NEXT: v_mov_b32_e32 v3, s4 +; GFX6-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s4, s20, 3 ; GFX6-NEXT: s_lshl_b32 s6, s4, 3 ; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX6-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX6-NEXT: v_mov_b32_e32 v5, v1 -; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v1, s6, v5 +; GFX6-NEXT: 
v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_and_b32_e32 v2, s7, v5 +; GFX6-NEXT: v_add_f32_e32 v1, v1, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX6-NEXT: v_or_b32_e32 v4, v2, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: v_mov_b32_e32 v2, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB14_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4299,11 +4299,11 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v6 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, -4, v6 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v11, v7 +; GFX12-TRUE16-NEXT: v_not_b32_e32 v10, v7 ; GFX12-TRUE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 @@ -4317,7 +4317,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen +; GFX12-TRUE16-NEXT: buffer_load_b32 v6, v9, s[4:7], null offen ; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_1 ; 
GFX12-TRUE16-NEXT: ; %bb.2: @@ -4327,17 +4327,19 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v8, v6 ; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v8 ; GFX12-TRUE16-NEXT: v_add_f16_e32 v6.l, v6.l, v5.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 -; GFX12-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v7, v8, v10, v6 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v8 ; GFX12-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -4352,14 +4354,13 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_4 ; 
GFX12-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v8 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -4367,7 +4368,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_3 ; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v6 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -4382,11 +4383,11 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX12-FAKE16-NEXT: s_mov_b32 s1, exec_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v6 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v9, -4, v6 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff -; GFX12-FAKE16-NEXT: v_not_b32_e32 v11, v7 +; GFX12-FAKE16-NEXT: v_not_b32_e32 v10, v7 ; GFX12-FAKE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 @@ -4400,7 +4401,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen +; 
GFX12-FAKE16-NEXT: buffer_load_b32 v6, v9, s[4:7], null offen ; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-FAKE16-NEXT: ; %bb.2: @@ -4410,17 +4411,19 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX12-FAKE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v8, v6 ; GFX12-FAKE16-NEXT: s_mov_b32 s2, exec_lo ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v6, v4, v8 ; GFX12-FAKE16-NEXT: v_add_f16_e32 v6, v6, v5 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 -; GFX12-FAKE16-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v7, v8, v10, v6 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v7, v8 ; GFX12-FAKE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX12-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -4435,14 +4438,13 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 
v[6:7], v9, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB15_4 ; GFX12-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v7, v8 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -4450,7 +4452,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB15_3 ; GFX12-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v4, v6 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -4475,7 +4477,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX942-NEXT: buffer_load_dword v7, v10, s[4:7], 0 offen +; GFX942-NEXT: buffer_load_dword v6, v10, s[4:7], 0 offen ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB15_1 ; GFX942-NEXT: ; %bb.2: @@ -4485,12 +4487,13 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX942-NEXT: ; =>This Loop Header: Depth=1 ; GFX942-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX942-NEXT: v_mov_b32_e32 v9, v6 +; GFX942-NEXT: v_lshrrev_b32_e32 v6, v4, v9 ; GFX942-NEXT: v_add_f16_e32 v6, v6, v5 ; GFX942-NEXT: v_lshlrev_b32_e32 v6, v4, v6 -; GFX942-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX942-NEXT: 
v_and_or_b32 v8, v9, v11, v6 ; GFX942-NEXT: s_mov_b64 s[8:9], exec -; GFX942-NEXT: v_mov_b64_e32 v[8:9], v[6:7] +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[8:9] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 @@ -4504,21 +4507,20 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[4:7], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[4:7], 0 offen sc0 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB15_4 ; GFX942-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX942-NEXT: s_mov_b64 exec, s[8:9] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v7, v8 ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB15_3 ; GFX942-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v6 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -4529,11 +4531,11 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, -4, v6 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v11, v7 +; GFX11-TRUE16-NEXT: v_not_b32_e32 v10, v7 ; GFX11-TRUE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 @@ -4545,7 +4547,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: buffer_load_b32 v7, v10, s[4:7], 0 offen +; GFX11-TRUE16-NEXT: buffer_load_b32 v6, v9, s[4:7], 0 offen ; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_1 ; GFX11-TRUE16-NEXT: ; %bb.2: @@ -4555,17 +4557,19 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, v6 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v8 ; GFX11-TRUE16-NEXT: v_add_f16_e32 v6.l, v6.l, v5.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 -; GFX11-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 
v7, v8, v10, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v8 ; GFX11-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -4579,14 +4583,13 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], 0 offen glc +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc ; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_4 ; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v8 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv ; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -4595,7 +4598,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_3 ; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v6 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -4606,11 +4609,11 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v6 -; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, -4, v6 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff -; GFX11-FAKE16-NEXT: v_not_b32_e32 v11, v7 +; GFX11-FAKE16-NEXT: v_not_b32_e32 v10, v7 ; GFX11-FAKE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 @@ -4622,7 +4625,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: buffer_load_b32 v7, v10, s[4:7], 0 offen +; GFX11-FAKE16-NEXT: buffer_load_b32 v6, v9, s[4:7], 0 offen ; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB15_1 ; GFX11-FAKE16-NEXT: ; %bb.2: @@ -4632,17 +4635,19 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX11-FAKE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, v6 ; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, v4, v8 ; GFX11-FAKE16-NEXT: v_add_f16_e32 v6, v6, v5 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 -; GFX11-FAKE16-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v7, v8, v10, v6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v8 ; GFX11-FAKE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX11-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -4656,14 +4661,13 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], 0 offen glc +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc ; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB15_4 ; GFX11-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v8 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -4672,7 +4676,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB15_3 ; GFX11-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v4, v6 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -4682,10 +4686,10 @@ define half 
@buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: v_and_b32_e32 v4, 3, v6 -; GFX10-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX10-NEXT: v_and_b32_e32 v9, -4, v6 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v11, v7 +; GFX10-NEXT: v_not_b32_e32 v10, v7 ; GFX10-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 @@ -4695,7 +4699,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen +; GFX10-NEXT: buffer_load_dword v6, v9, s[8:11], 0 offen ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB15_1 @@ -4705,14 +4709,15 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX10-NEXT: v_mov_b32_e32 v8, v6 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, v4, v8 ; GFX10-NEXT: v_add_f16_e32 v6, v6, v5 ; GFX10-NEXT: v_lshlrev_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX10-NEXT: v_mov_b32_e32 v9, v7 -; GFX10-NEXT: v_mov_b32_e32 v8, v6 +; GFX10-NEXT: v_and_or_b32 v7, v8, v10, v6 +; GFX10-NEXT: v_mov_b32_e32 v6, v7 +; GFX10-NEXT: v_mov_b32_e32 v7, v8 ; GFX10-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 @@ -4724,15 
+4729,14 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB15_4 ; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 -; GFX10-NEXT: v_mov_b32_e32 v7, v8 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 @@ -4741,7 +4745,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: s_cbranch_execnz .LBB15_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v4, v6 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -4765,7 +4769,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen +; GFX90A-NEXT: buffer_load_dword v6, v10, s[8:11], 0 offen ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 ; GFX90A-NEXT: ; %bb.2: @@ -4775,12 +4779,13 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v4, 
v7 +; GFX90A-NEXT: v_mov_b32_e32 v9, v6 +; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v4, v9 ; GFX90A-NEXT: v_add_f16_e32 v6, v6, v5 ; GFX90A-NEXT: v_lshlrev_b32_e32 v6, v4, v6 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX90A-NEXT: v_and_or_b32 v8, v9, v11, v6 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1] ; GFX90A-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -4792,33 +4797,32 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB15_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v7, v8 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB15_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v6 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX908-NEXT: v_and_b32_e32 v10, -4, v4 +; GFX908-NEXT: v_and_b32_e32 v9, -4, v4 ; GFX908-NEXT: 
v_and_b32_e32 v4, 3, v4 ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX908-NEXT: s_mov_b32 s4, 0xffff ; GFX908-NEXT: v_lshlrev_b32_e64 v6, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v11, v6 +; GFX908-NEXT: v_not_b32_e32 v10, v6 ; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -4830,7 +4834,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen +; GFX908-NEXT: buffer_load_dword v6, v9, s[8:11], 0 offen ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB15_1 ; GFX908-NEXT: ; %bb.2: @@ -4840,13 +4844,14 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX908-NEXT: v_mov_b32_e32 v8, v6 +; GFX908-NEXT: v_lshrrev_b32_e32 v6, v4, v8 ; GFX908-NEXT: v_add_f16_e32 v6, v6, v5 ; GFX908-NEXT: v_lshlrev_b32_e32 v6, v4, v6 -; GFX908-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX908-NEXT: v_mov_b32_e32 v9, v7 +; GFX908-NEXT: v_and_or_b32 v7, v8, v10, v6 +; GFX908-NEXT: v_mov_b32_e32 v6, v7 ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_mov_b32_e32 v8, v6 +; GFX908-NEXT: v_mov_b32_e32 v7, v8 ; GFX908-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -4858,33 +4863,32 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc +; GFX908-NEXT: 
buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB15_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v8 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB15_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v4, v6 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4 -; GFX8-NEXT: v_and_b32_e32 v10, -4, v4 +; GFX8-NEXT: v_and_b32_e32 v9, -4, v4 ; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX8-NEXT: s_mov_b32 s4, 0xffff ; GFX8-NEXT: v_lshlrev_b32_e64 v6, v4, s4 -; GFX8-NEXT: v_not_b32_e32 v11, v6 +; GFX8-NEXT: v_not_b32_e32 v10, v6 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -4896,7 +4900,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen +; GFX8-NEXT: buffer_load_dword v6, v9, s[8:11], 0 offen ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB15_1 ; GFX8-NEXT: ; %bb.2: @@ -4906,14 +4910,15 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX8-NEXT: ; =>This Loop 
Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX8-NEXT: v_mov_b32_e32 v8, v6 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, v4, v8 ; GFX8-NEXT: v_add_f16_e32 v6, v6, v5 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, v4, v6 -; GFX8-NEXT: v_and_b32_e32 v8, v7, v11 -; GFX8-NEXT: v_or_b32_e32 v6, v8, v6 -; GFX8-NEXT: v_mov_b32_e32 v9, v7 +; GFX8-NEXT: v_and_b32_e32 v7, v8, v10 +; GFX8-NEXT: v_or_b32_e32 v7, v7, v6 +; GFX8-NEXT: v_mov_b32_e32 v6, v7 ; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_mov_b32_e32 v8, v6 +; GFX8-NEXT: v_mov_b32_e32 v7, v8 ; GFX8-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -4925,21 +4930,20 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB15_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v8 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB15_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v4, v6 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -4961,18 +4965,19 @@ define half 
@buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX7-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB15_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v5 ; GFX7-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 ; GFX7-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_and_b32_e32 v5, v6, v9 @@ -5002,7 +5007,6 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB15_3 @@ -5031,18 +5035,20 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX6-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB15_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v4 +; 
GFX6-NEXT: v_cvt_f32_f16_e32 v10, v5 ; GFX6-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 ; GFX6-NEXT: ; Child Loop BB15_4 Depth 2 -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_and_b32_e32 v5, v6, v9 @@ -5072,7 +5078,6 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_cbranch_execnz .LBB15_3 @@ -5103,53 +5108,53 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, s4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen +; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v3, s[0:3], null offen ; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 ; GFX12-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; 
GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX12-TRUE16-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v2.l ; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 ; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX12-TRUE16-NEXT: 
buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -5161,54 +5166,54 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, s4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen +; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v3, s[0:3], null offen ; GFX12-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 ; GFX12-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v1 ; 
GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX12-FAKE16-NEXT: v_add_f32_e32 v0, v0, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX12-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add_f32_e32 v1, v1, v0 +; GFX12-FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -5217,36 +5222,36 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_addk_i32 s16, 0x200 ; GFX942-NEXT: s_and_b32 s4, s16, -4 -; GFX942-NEXT: v_mov_b32_e32 v4, s4 -; GFX942-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen +; GFX942-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen ; GFX942-NEXT: s_and_b32 s4, s16, 3 ; GFX942-NEXT: s_lshl_b32 s6, s4, 3 ; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX942-NEXT: s_not_b32 s7, s4 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX942-NEXT: s_movk_i32 s8, 0x7fff ; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_lshrrev_b32_sdwa v2, 
s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: v_add_f32_e32 v0, v0, v5 -; GFX942-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX942-NEXT: v_add3_u32 v2, v2, v0, s8 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX942-NEXT: v_add_f32_e32 v2, v2, v0 +; GFX942-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v3, v3, v2, s8 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1] -; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v2, s6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v4, v5, s7, v2 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[4:5] +; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 ; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX942-NEXT: s_cbranch_execnz .LBB16_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5260,146 +5265,146 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3 ; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v3, s[0:3], 0 offen ; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v2.l ; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v3 -; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB16_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4 ; 
GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3 ; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v3, s[0:3], 0 offen ; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v1 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, v0, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, v1, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB16_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_and_b32 s4, s20, 3 ; GFX10-NEXT: s_lshl_b32 s4, s4, 3 ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX10-NEXT: 
buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX10-NEXT: s_not_b32 s6, s5 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v5 -; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX10-NEXT: v_lshrrev_b32_sdwa v1, s4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_add_f32_e32 v1, v1, v0 +; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: v_mov_b32_e32 v2, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz 
.LBB16_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -5407,34 +5412,34 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 ; GFX90A-NEXT: s_and_b32 s4, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX90A-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 ; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 ; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v0, v0, v5 -; GFX90A-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v0 +; GFX90A-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v3, v3, v2, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, s6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5447,40 +5452,40 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 ; GFX908-NEXT: s_and_b32 s4, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 -; GFX908-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX908-NEXT: v_mov_b32_e32 v3, s4 +; GFX908-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 ; GFX908-NEXT: s_lshl_b32 s6, s4, 3 ; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v0, v0, v5 -; GFX908-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX908-NEXT: 
v_or_b32_e32 v3, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v2, v2, v0, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX908-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_lshrrev_b32_sdwa v1, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v1, v1, v0 +; GFX908-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX908-NEXT: v_add3_u32 v2, v2, v1, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX908-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v1, s6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v4, v5, s7, v1 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB16_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -5488,42 +5493,42 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 
s20, 0x200 ; GFX8-NEXT: s_and_b32 s4, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 ; GFX8-NEXT: s_lshl_b32 s6, s4, 3 ; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_lshrrev_b32_sdwa v1, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_add_f32_e32 v1, v1, v0 +; GFX8-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v1 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v4, s7, v5 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 
v4, v4, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -5531,38 +5536,38 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 ; GFX7-NEXT: s_and_b32 s4, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v3, s4 +; GFX7-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX7-NEXT: s_and_b32 s4, s20, 3 ; GFX7-NEXT: s_lshl_b32 s6, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_add_f32_e32 v0, v0, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], 
v4, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, s6, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_add_f32_e32 v1, v1, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v2, s7, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX7-NEXT: v_or_b32_e32 v4, v2, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: v_mov_b32_e32 v2, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -5571,39 +5576,39 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 ; GFX6-NEXT: s_and_b32 s4, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v4, s4 -; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX6-NEXT: v_mov_b32_e32 v3, s4 +; GFX6-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX6-NEXT: s_and_b32 s4, s20, 3 ; GFX6-NEXT: s_lshl_b32 s6, s4, 3 ; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_add_f32_e32 v0, 
v0, v5 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: v_mov_b32_e32 v3, v1 -; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX6-NEXT: v_lshrrev_b32_e32 v1, s6, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_add_f32_e32 v1, v1, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_and_b32_e32 v2, s7, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX6-NEXT: v_or_b32_e32 v4, v2, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: v_mov_b32_e32 v2, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB16_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -5630,40 +5635,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen +; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v3, s[0:3], null offen ; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 ; GFX12-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: 
v_lshrrev_b32_e32 v1, s4, v2 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX12-TRUE16-NEXT: v_add_f32_e32 v1, v1, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, v4.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v2.l +; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 ; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.h -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: 
v_dual_mov_b32 v4, v1 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -5681,47 +5686,47 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen +; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v3, s[0:3], null offen ; GFX12-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 ; GFX12-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 
v5, v1 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX12-FAKE16-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX12-FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add_f32_e32 v1, v1, v0 +; GFX12-FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -5736,36 +5741,36 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_addk_i32 s16, 0x200 ; GFX942-NEXT: s_and_b32 s4, s16, -4 -; GFX942-NEXT: v_mov_b32_e32 v2, s4 -; GFX942-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen +; GFX942-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen ; GFX942-NEXT: s_and_b32 s4, s16, 3 ; GFX942-NEXT: s_lshl_b32 s6, s4, 3 ; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX942-NEXT: s_not_b32 s7, s4 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX942-NEXT: s_movk_i32 s8, 0x7fff ; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_lshrrev_b32_sdwa v2, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX942-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX942-NEXT: v_add3_u32 v4, v4, v0, s8 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v0, 
v0 +; GFX942-NEXT: v_add_f32_e32 v2, v2, v0 +; GFX942-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v3, v3, v2, s8 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] -; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v2, s6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v4, v5, s7, v2 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[4:5] +; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 ; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX942-NEXT: s_cbranch_execnz .LBB17_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5783,40 +5788,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v3, s[0:3], 0 offen ; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v4.l, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, v1, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v4.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v2.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v4 -; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: v_and_or_b32 
v4, v5, s6, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -5829,45 +5834,45 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3 ; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v3, s[0:3], 0 offen ; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v1 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; 
GFX11-FAKE16-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, v1, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 
v4, v1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -5880,36 +5885,36 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_and_b32 s4, s20, 3 ; GFX10-NEXT: s_lshl_b32 s4, s4, 3 ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX10-NEXT: s_not_b32 s6, s5 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshrrev_b32_sdwa v1, s4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_add_f32_e32 v1, v1, v0 +; 
GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: v_mov_b32_e32 v2, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB17_1 @@ -5922,34 +5927,34 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 ; GFX90A-NEXT: s_and_b32 s4, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX90A-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 ; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 ; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX90A-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 
v4, v4, v0, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v0 +; GFX90A-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v3, v3, v2, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, s6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5961,35 +5966,35 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 ; GFX908-NEXT: s_and_b32 s4, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v2, s4 -; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX908-NEXT: v_mov_b32_e32 v3, s4 +; GFX908-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 ; GFX908-NEXT: s_lshl_b32 s6, s4, 3 ; GFX908-NEXT: s_lshl_b32 
s4, 0xffff, s6 ; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX908-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v4, v4, v0, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX908-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX908-NEXT: v_lshrrev_b32_sdwa v1, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v1, v1, v0 +; GFX908-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX908-NEXT: v_add3_u32 v2, v2, v1, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX908-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v1, s6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v4, v5, s7, v1 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: 
s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6001,37 +6006,37 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 ; GFX8-NEXT: s_and_b32 s4, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 ; GFX8-NEXT: s_lshl_b32 s6, s4, 3 ; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX8-NEXT: v_lshrrev_b32_sdwa v1, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_add_f32_e32 v1, v1, v0 +; GFX8-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; 
GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v4, s7, v5 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6043,33 +6048,33 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 ; GFX7-NEXT: s_and_b32 s4, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v3, s4 +; GFX7-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX7-NEXT: s_and_b32 s4, s20, 3 ; GFX7-NEXT: s_lshl_b32 s6, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX7-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap 
v[4:5], v2, s[16:19], 0 offen glc +; GFX7-NEXT: v_lshrrev_b32_e32 v1, s6, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_add_f32_e32 v1, v1, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v2, s7, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX7-NEXT: v_or_b32_e32 v4, v2, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: v_mov_b32_e32 v2, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6081,34 +6086,34 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 ; GFX6-NEXT: s_and_b32 s4, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX6-NEXT: v_mov_b32_e32 v3, s4 +; GFX6-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX6-NEXT: s_and_b32 s4, s20, 3 ; GFX6-NEXT: s_lshl_b32 s6, s4, 3 ; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX6-NEXT: v_mov_b32_e32 v5, 
v1 -; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v1, s6, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_add_f32_e32 v1, v1, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_and_b32_e32 v2, s7, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX6-NEXT: v_or_b32_e32 v4, v2, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: v_mov_b32_e32 v2, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB17_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6132,11 +6137,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v6 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, -4, v6 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v11, v7 +; GFX12-TRUE16-NEXT: v_not_b32_e32 v10, v7 ; GFX12-TRUE16-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 @@ -6150,7 +6155,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: 
buffer_load_b32 v7, v10, s[4:7], null offen +; GFX12-TRUE16-NEXT: buffer_load_b32 v6, v9, s[4:7], null offen ; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-TRUE16-NEXT: ; %bb.2: @@ -6160,28 +6165,31 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX12-TRUE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.h, v5.l +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v8, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v5.l ; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v8 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add_f32_e32 v6, v6, v8 -; GFX12-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX12-TRUE16-NEXT: v_add_f32_e32 v6, v6, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v11, v6, 16, 1 ; GFX12-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v6 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v11, v11, v6, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v12, vcc_lo -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: 
v_cndmask_b32_e32 v6, v11, v12, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v8.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v7.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX12-TRUE16-NEXT: v_and_or_b32 v7, v8, v10, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v8 ; GFX12-TRUE16-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -6196,14 +6204,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB18_4 ; GFX12-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v8 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -6211,7 +6218,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB18_3 ; GFX12-TRUE16-NEXT: ; %bb.6: ; 
%atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v6 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -6244,7 +6251,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen +; GFX12-FAKE16-NEXT: buffer_load_b32 v4, v8, s[4:7], null offen ; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-FAKE16-NEXT: ; %bb.2: @@ -6255,25 +6262,26 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX12-FAKE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s2, exec_lo ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_add_f32_e32 v4, v4, v10 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX12-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v4 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v6 ; GFX12-FAKE16-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 @@ -6297,7 +6305,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -6330,7 +6337,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX942-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen +; GFX942-NEXT: buffer_load_dword v4, v9, s[4:7], 0 offen ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB18_1 ; GFX942-NEXT: ; %bb.2: @@ -6342,6 +6349,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX942-NEXT: ; =>This Loop Header: Depth=1 ; GFX942-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v4 ; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_mov_b64 s[8:9], exec ; GFX942-NEXT: v_add_f32_e32 v4, 
v4, v11 @@ -6375,7 +6383,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v7, v4 ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB18_3 @@ -6392,11 +6399,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, -4, v6 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v11, v7 +; GFX11-TRUE16-NEXT: v_not_b32_e32 v10, v7 ; GFX11-TRUE16-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 @@ -6408,7 +6415,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: buffer_load_b32 v7, v10, s[4:7], 0 offen +; GFX11-TRUE16-NEXT: buffer_load_b32 v6, v9, s[4:7], 0 offen ; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB18_1 ; GFX11-TRUE16-NEXT: ; %bb.2: @@ -6419,27 +6426,30 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v6, v4, v7 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v5.l +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v5.l ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v8 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, v6, v8 -; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, v6, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v6, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v6 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v12, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v6, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v11, v12, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v7.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX11-TRUE16-NEXT: v_and_or_b32 v7, v8, v10, v6 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v8 ; GFX11-TRUE16-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -6453,14 +6463,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], 0 offen glc +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc ; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB18_4 ; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v8 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv ; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -6470,7 +6479,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v6 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -6497,7 +6506,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 -; 
GFX11-FAKE16-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen +; GFX11-FAKE16-NEXT: buffer_load_b32 v4, v8, s[4:7], 0 offen ; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB18_1 ; GFX11-FAKE16-NEXT: ; %bb.2: @@ -6509,24 +6518,25 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX11-FAKE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, v4, v10 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v4 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; 
GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v5 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6 ; GFX11-FAKE16-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 @@ -6549,7 +6559,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -6582,7 +6591,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX10-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB18_1 @@ -6593,9 +6602,10 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_add_f32_e32 v4, v4, v10 ; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v4 @@ -6625,7 +6635,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: 
buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 @@ -6658,7 +6667,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen +; GFX90A-NEXT: buffer_load_dword v4, v9, s[8:11], 0 offen ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 ; GFX90A-NEXT: ; %bb.2: @@ -6670,6 +6679,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX90A-NEXT: v_add_f32_e32 v4, v4, v11 ; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 @@ -6700,7 +6710,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB18_3 @@ -6730,7 +6739,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX908-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB18_1 ; GFX908-NEXT: ; %bb.2: @@ -6742,6 +6751,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX908-NEXT: s_waitcnt 
vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX908-NEXT: v_add_f32_e32 v4, v4, v10 ; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 @@ -6773,7 +6783,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB18_3 @@ -6803,7 +6812,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX8-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB18_1 ; GFX8-NEXT: ; %bb.2: @@ -6814,6 +6823,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_add_f32_e32 v4, v4, v10 ; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1 @@ -6847,7 +6857,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB18_3 @@ -6875,18 +6884,19 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], 
s[10:11], v[2:3] ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX7-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB18_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX7-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 ; GFX7-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_add_f32_e32 v4, v4, v10 @@ -6916,7 +6926,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB18_3 @@ -6945,18 +6954,20 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX6-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB18_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v5 +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 -; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX6-NEXT: .LBB18_3: ; 
%atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 ; GFX6-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_add_f32_e32 v4, v4, v10 @@ -6986,7 +6997,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_cbranch_execnz .LBB18_3 @@ -7287,21 +7297,20 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v1, v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v5, v2 +; GFX11-NEXT: v_mov_b32_e32 v5, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v4, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_add_f16 v4, v5, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 
; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -7317,20 +7326,20 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX10-NEXT: s_add_i32 s4, s20, 0x400 ; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_add_f16 v1, v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX10-NEXT: v_pk_add_f16 v4, v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: v_mov_b32_e32 v2, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB20_1 @@ -7360,24 +7369,24 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s20 -; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: 
v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v4, v2, v0 -; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_add_f16_sdwa v1, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v2, v5, v0 +; GFX8-NEXT: v_or_b32_e32 v4, v2, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB20_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8279,21 +8288,20 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v1, v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v5, v2 +; GFX11-NEXT: v_mov_b32_e32 v5, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v4, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_add_f16 v4, v5, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; 
GFX11-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -8309,20 +8317,20 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX10-NEXT: s_add_i32 s4, s20, 0x400 ; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_add_f16 v1, v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX10-NEXT: v_pk_add_f16 v4, v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: v_mov_b32_e32 v2, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB23_1 @@ -8334,21 +8342,21 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: 
buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_add_f16 v2, v3, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_pk_add_f16 v4, v5, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB23_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8359,22 +8367,22 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 -; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_add_f16 v1, v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v5, v2 -; GFX908-NEXT: v_mov_b32_e32 v4, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_pk_add_f16 v4, v5, v0 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v5 +; 
GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB23_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8385,24 +8393,24 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s20 -; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v4, v2, v0 -; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_add_f16_sdwa v1, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v2, v5, v0 +; GFX8-NEXT: v_or_b32_e32 v4, v2, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz 
.LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8806,21 +8814,20 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v1, v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v5, v2 +; GFX11-NEXT: v_mov_b32_e32 v5, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v4, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_add_f16 v4, v5, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -8836,20 +8843,20 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX10-NEXT: s_add_i32 s4, s20, 0x400 ; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_add_f16 v1, v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, 
v2 +; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX10-NEXT: v_pk_add_f16 v4, v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: v_mov_b32_e32 v2, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB25_1 @@ -8861,21 +8868,21 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_add_f16 v2, v3, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_pk_add_f16 v4, v5, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; 
GFX90A-NEXT: s_cbranch_execnz .LBB25_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8886,22 +8893,22 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 -; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_add_f16 v1, v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v5, v2 -; GFX908-NEXT: v_mov_b32_e32 v4, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_pk_add_f16 v4, v5, v0 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB25_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8912,24 +8919,24 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s20 -; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v4, v2, v0 -; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_add_f16_sdwa v1, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v2, v5, v0 +; GFX8-NEXT: v_or_b32_e32 v4, v2, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB25_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9495,40 +9502,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v1, s16 -; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:1024 ; GFX942-NEXT: s_add_i32 s4, s16, 0x400 ; GFX942-NEXT: s_mov_b64 s[6:7], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX942-NEXT: s_movk_i32 s8, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX942-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX942-NEXT: s_mov_b32 s9, 0x7060302 ; GFX942-NEXT: v_mov_b32_e32 v4, s4 ; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX942-NEXT: v_and_b32_e32 
v5, 0xffff0000, v1 -; GFX942-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX942-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX942-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX942-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; GFX942-NEXT: v_add_f32_e32 v2, v2, v1 +; GFX942-NEXT: v_add_f32_e32 v3, v3, v0 +; GFX942-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v5, v5, v2, s8 +; GFX942-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX942-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1] -; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX942-NEXT: v_perm_b32 v6, v3, v2, s9 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[6:7] +; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v7 ; GFX942-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v1, v6 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX942-NEXT: s_cbranch_execnz .LBB27_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9538,10 +9545,11 @@ define void 
@buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s16 ; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 @@ -9549,34 +9557,35 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v1 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, v5, v2 :: v_dual_add_f32 v0, v0, v3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, v2, v3 :: v_dual_lshlrev_b32 v1, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, v1, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v1, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v4, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v6 ; GFX11-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, 
exec_lo, s4 @@ -9589,10 +9598,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s16 ; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 @@ -9600,32 +9610,33 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v1 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2 -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, v2, v0 :: v_dual_lshlrev_b32 v1, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1 -; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v1, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, v5, v8, s4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v2, v1, 0x7060302 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v4, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v5 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v6 ; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -9640,38 +9651,38 @@ define void 
@buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 ; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, s4 ; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v6, v1 -; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v0 +; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, 
v2, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v1, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, v8, s4 +; GFX10-NEXT: v_perm_b32 v5, v2, v1, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_mov_b32_e32 v2, v6 +; GFX10-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB27_1 @@ -9683,39 +9694,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; 
GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v1 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v0 +; GFX90A-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v5, v5, v2, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v6, v3, v2, s9 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9729,37 +9740,37 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s4, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, 
v0 +; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: v_mov_b32_e32 v4, s4 ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 ; GFX908-NEXT: v_mov_b32_e32 v6, v1 -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v0 +; GFX908-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v2, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX908-NEXT: v_add3_u32 v5, v5, v1, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v2, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX908-NEXT: v_cndmask_b32_e64 v1, v5, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v5, v2, v1, s9 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: v_mov_b32_e32 v2, v6 +; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; 
GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9773,38 +9784,38 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 ; GFX8-NEXT: v_mov_b32_e32 v6, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v0 +; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v2, 
16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v2 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v5, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_alignbit_b32 v5, v2, v1, 16 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v2, v6 +; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11102,40 +11113,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v1, s16 -; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:1024 ; GFX942-NEXT: s_add_i32 s4, s16, 0x400 ; GFX942-NEXT: s_mov_b64 s[6:7], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX942-NEXT: s_movk_i32 s8, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX942-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX942-NEXT: s_mov_b32 s9, 0x7060302 ; GFX942-NEXT: v_mov_b32_e32 v4, s4 ; GFX942-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; 
GFX942-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX942-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX942-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX942-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; GFX942-NEXT: v_add_f32_e32 v2, v2, v1 +; GFX942-NEXT: v_add_f32_e32 v3, v3, v0 +; GFX942-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v5, v5, v2, s8 +; GFX942-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX942-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1] -; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX942-NEXT: v_perm_b32 v6, v3, v2, s9 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[6:7] +; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v7 ; GFX942-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v1, v6 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX942-NEXT: s_cbranch_execnz .LBB30_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11145,10 +11156,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr 
addrspace ; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s16 ; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 @@ -11156,34 +11168,35 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX11-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v1 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, v5, v2 :: v_dual_add_f32 v0, v0, v3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, v2, v3 :: v_dual_lshlrev_b32 v1, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, v1, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 
v9, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v1, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v4, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v6 ; GFX11-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -11196,10 +11209,11 @@ define void 
@buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s16 ; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 @@ -11207,32 +11221,33 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX11-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v1 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2 -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, v2, v0 :: v_dual_lshlrev_b32 v1, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v9, 0x400000, v5 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v1, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, v5, v8, s4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v2, v1, 0x7060302 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v4, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v5 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v6 ; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -11247,38 +11262,38 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX10-NEXT: v_mov_b32_e32 v1, s20 ; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, s4 ; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v6, v1 -; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v0 +; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v1, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX10-NEXT: 
v_cndmask_b32_e64 v1, v5, v8, s4 +; GFX10-NEXT: v_perm_b32 v5, v2, v1, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_mov_b32_e32 v2, v6 +; GFX10-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB30_1 @@ -11290,39 +11305,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 ; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, 
v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v1 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v0 +; GFX90A-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v5, v5, v2, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v6, v3, v2, s9 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11336,37 +11351,37 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s4, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: v_mov_b32_e32 v4, 
s4 ; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 ; GFX908-NEXT: v_mov_b32_e32 v6, v1 -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v0 +; GFX908-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v2, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX908-NEXT: v_add3_u32 v5, v5, v1, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v2, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX908-NEXT: v_cndmask_b32_e64 v1, v5, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v5, v2, v1, s9 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: v_mov_b32_e32 v2, v6 +; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; 
GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB30_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11380,38 +11395,38 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 ; GFX8-NEXT: v_mov_b32_e32 v6, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v0 +; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v2 +; GFX8-NEXT: v_add_u32_e32 
v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v5, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_alignbit_b32 v5, v2, v1, 16 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v2, v6 +; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB30_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11965,40 +11980,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v1, s16 -; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:1024 ; GFX942-NEXT: s_add_i32 s4, s16, 0x400 ; GFX942-NEXT: s_mov_b64 s[6:7], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX942-NEXT: s_movk_i32 s8, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX942-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX942-NEXT: s_mov_b32 s9, 0x7060302 ; GFX942-NEXT: v_mov_b32_e32 v4, s4 ; GFX942-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX942-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX942-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX942-NEXT: v_bfe_u32 v6, v0, 16, 1 -; 
GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX942-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; GFX942-NEXT: v_add_f32_e32 v2, v2, v1 +; GFX942-NEXT: v_add_f32_e32 v3, v3, v0 +; GFX942-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v5, v5, v2, s8 +; GFX942-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX942-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1] -; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX942-NEXT: v_perm_b32 v6, v3, v2, s9 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[6:7] +; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v7 ; GFX942-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v1, v6 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX942-NEXT: s_cbranch_execnz .LBB32_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12008,10 +12023,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory: ; 
GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s16 ; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 @@ -12019,34 +12035,35 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX11-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v1 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, v5, v2 :: v_dual_add_f32 v0, v0, v3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, v2, v3 :: v_dual_lshlrev_b32 v1, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, v1, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; 
GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v1, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v4, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v6 ; GFX11-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -12059,10 +12076,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX11-FAKE16-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s16 ; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 @@ -12070,32 +12088,33 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX11-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v1 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2 -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, v2, v0 :: v_dual_lshlrev_b32 v1, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 
vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v1, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, v5, v8, s4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v2, v1, 0x7060302 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v4, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v5 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v6 ; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -12110,38 +12129,38 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 ; GFX10-NEXT: s_add_i32 s4, s20, 0x400 
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, s4 ; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v6, v1 -; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v0 +; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v1, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, v8, s4 +; GFX10-NEXT: v_perm_b32 v5, v2, v1, 
0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_mov_b32_e32 v2, v6 +; GFX10-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB32_1 @@ -12153,39 +12172,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 ; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; 
GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v1 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v0 +; GFX90A-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v5, v5, v2, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v6, v3, v2, s9 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB32_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12199,37 +12218,37 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s4, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: v_mov_b32_e32 v4, s4 ; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX908-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 ; GFX908-NEXT: v_mov_b32_e32 v6, v1 -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v0 +; GFX908-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v2, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX908-NEXT: v_add3_u32 v5, v5, v1, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v2, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX908-NEXT: v_cndmask_b32_e64 v1, v5, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v5, v2, v1, s9 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: v_mov_b32_e32 v2, v6 +; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: 
s_cbranch_execnz .LBB32_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12243,38 +12262,38 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 ; GFX8-NEXT: v_mov_b32_e32 v6, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v0 +; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v2 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 
0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v5, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_alignbit_b32 v5, v2, v1, 16 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v2, v6 +; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB32_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12391,40 +12410,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v1, s16 -; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:1024 ; GFX942-NEXT: s_add_i32 s4, s16, 0x400 ; GFX942-NEXT: s_mov_b64 s[6:7], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX942-NEXT: s_movk_i32 s8, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX942-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX942-NEXT: s_mov_b32 s9, 0x7060302 ; GFX942-NEXT: v_mov_b32_e32 v4, s4 ; GFX942-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX942-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX942-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX942-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: 
v_or_b32_e32 v7, 0x400000, v0 -; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX942-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; GFX942-NEXT: v_add_f32_e32 v2, v2, v1 +; GFX942-NEXT: v_add_f32_e32 v3, v3, v0 +; GFX942-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v5, v5, v2, s8 +; GFX942-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX942-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1] -; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX942-NEXT: v_perm_b32 v6, v3, v2, s9 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[6:7] +; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v7 ; GFX942-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v1, v6 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX942-NEXT: s_cbranch_execnz .LBB33_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12434,10 +12453,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11-TRUE16: ; %bb.0: ; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s16 ; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 @@ -12445,34 +12465,35 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v1 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, v5, v2 :: v_dual_add_f32 v0, v0, v3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, v2, v3 :: v_dual_lshlrev_b32 v1, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, v1, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 
0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v1, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v4, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v6 ; GFX11-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -12485,10 +12506,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-FAKE16-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s16 ; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 @@ -12496,32 +12518,33 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v1 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2 -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, v2, v0 :: v_dual_lshlrev_b32 v1, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; 
GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v1, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, v5, v8, s4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v2, v1, 0x7060302 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v4, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v5 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v6 ; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -12536,38 +12559,38 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 ; 
GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, s4 ; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v6, v1 -; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v0 +; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v1, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, v8, s4 +; 
GFX10-NEXT: v_perm_b32 v5, v2, v1, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_mov_b32_e32 v2, v6 +; GFX10-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB33_1 @@ -12579,39 +12602,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 ; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX90A-NEXT: 
v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v1 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v0 +; GFX90A-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v5, v5, v2, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v6, v3, v2, s9 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12625,37 +12648,37 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s4, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: v_mov_b32_e32 v4, s4 ; GFX908-NEXT: .LBB33_1: ; 
%atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 ; GFX908-NEXT: v_mov_b32_e32 v6, v1 -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v0 +; GFX908-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v2, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX908-NEXT: v_add3_u32 v5, v5, v1, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v2, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX908-NEXT: v_cndmask_b32_e64 v1, v5, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v5, v2, v1, s9 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: v_mov_b32_e32 v2, v6 +; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_andn2_b64 exec, 
exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB33_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12669,38 +12692,38 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 ; GFX8-NEXT: v_mov_b32_e32 v6, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v0 +; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v2 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; 
GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v5, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_alignbit_b32 v5, v2, v1, 16 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v2, v6 +; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB33_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll index 6f1675edbe58a..9a2998c74dc82 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll @@ -211,24 +211,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_ ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v1, s16 -; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:1024 ; GFX942-NEXT: s_add_i32 s6, s16, 0x400 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v0, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, s6 +; GFX942-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX942-NEXT: v_mov_b32_e32 v1, s6 ; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX942-NEXT: v_max_f32_e32 v0, v0, 
v2 -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[4:5] ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 ; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX942-NEXT: s_cbranch_execnz .LBB1_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -261,23 +261,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_ ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v0, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: 
buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -291,21 +291,21 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_ ; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v0, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX908-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_max_f32_e32 v1, v5, v5 +; GFX908-NEXT: v_max_f32_e32 v4, v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB1_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -319,21 +319,21 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: 
.LBB1_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX8-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; GFX8-NEXT: v_max_f32_e32 v4, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB1_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1396,7 +1396,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, s16 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 @@ -1405,17 +1405,18 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 -; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; 
GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[9:10], v[9:10] +; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[2:3], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX12-NEXT: v_dual_mov_b32 v4, v9 :: v_dual_mov_b32 v5, v10 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[2:5], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] -; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[9:10] ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1439,7 +1440,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v2, s16 -; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX11-NEXT: s_add_i32 s4, s16, 0x800 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v6, s4 @@ -1448,18 +1449,19 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 -; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[2:3], v[9:10], v[9:10] +; GFX11-NEXT: v_max_f64 v[7:8], v[2:3], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX11-NEXT: v_dual_mov_b32 v4, v9 :: v_dual_mov_b32 v5, v10 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[2:5], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] -; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[9:10] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1493,26 +1495,26 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v2, s20 ; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048 -; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX908-NEXT: v_mov_b32_e32 v10, v3 ; GFX908-NEXT: v_mov_b32_e32 v9, v2 -; GFX908-NEXT: v_mov_b32_e32 v8, v1 -; GFX908-NEXT: v_mov_b32_e32 v7, v0 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc +; GFX908-NEXT: v_max_f64 v[2:3], v[9:10], v[9:10] +; GFX908-NEXT: v_max_f64 v[7:8], v[2:3], v[0:1] +; GFX908-NEXT: v_mov_b32_e32 v2, v7 +; GFX908-NEXT: v_mov_b32_e32 v3, v8 +; GFX908-NEXT: v_mov_b32_e32 v4, v9 +; GFX908-NEXT: v_mov_b32_e32 
v5, v10 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[2:5], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] -; GFX908-NEXT: v_mov_b32_e32 v2, v7 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[9:10] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v8 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB6_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1524,26 +1526,26 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, s20 ; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048 -; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v10, v3 ; GFX8-NEXT: v_mov_b32_e32 v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v8, v1 -; GFX8-NEXT: v_mov_b32_e32 v7, v0 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_max_f64 v[2:3], v[9:10], v[9:10] +; GFX8-NEXT: v_max_f64 v[7:8], v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, v7 +; GFX8-NEXT: v_mov_b32_e32 v3, v8 +; GFX8-NEXT: v_mov_b32_e32 v4, v9 +; GFX8-NEXT: v_mov_b32_e32 v5, v10 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[2:5], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, v7 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[9:10] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v8 ; GFX8-NEXT: 
s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB6_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2499,42 +2501,43 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, s4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen +; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v3, s[0:3], null offen ; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 ; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v1 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 -; GFX12-TRUE16-NEXT: 
buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -2546,46 +2549,47 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v0, v0 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, s4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen +; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v3, s[0:3], null offen ; GFX12-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX12-FAKE16-NEXT: 
s_mov_b32 s5, 0 ; GFX12-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v1 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v5 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v1 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; 
GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -2594,30 +2598,30 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_addk_i32 s16, 0x200 ; GFX942-NEXT: s_and_b32 s4, s16, -4 -; GFX942-NEXT: v_mov_b32_e32 v4, s4 -; GFX942-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen +; GFX942-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen ; GFX942-NEXT: s_and_b32 s4, s16, 3 ; GFX942-NEXT: s_lshl_b32 s6, s4, 3 ; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX942-NEXT: s_not_b32 s7, s4 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_max_f16_e32 v5, v0, v0 +; GFX942-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX942-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX942-NEXT: v_max_f16_e32 v0, v0, v5 -; GFX942-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX942-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v2, s6, v5 +; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v2, v2, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, s6, v2 +; GFX942-NEXT: v_and_or_b32 v4, v5, s7, v2 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[4:5] ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; 
GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 ; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX942-NEXT: s_cbranch_execnz .LBB10_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2632,125 +2636,127 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l ; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3 ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v3, s[0:3], 0 offen ; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v1 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l -; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: 
v_and_or_b32 v1, v2, s6, v1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB10_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v0, v0 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3 ; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen +; 
GFX11-FAKE16-NEXT: buffer_load_b32 v1, v3, s[0:3], 0 offen ; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v1 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v1, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; 
GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB10_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 -; GFX10-NEXT: v_max_f16_e32 v5, v0, v0 +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_and_b32 s4, s20, 3 ; GFX10-NEXT: s_lshl_b32 s4, s4, 3 ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX10-NEXT: s_not_b32 s6, s5 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX10-NEXT: v_max_f16_e32 v0, v0, v5 -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX10-NEXT: v_lshrrev_b32_e32 v1, s4, v5 +; GFX10-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f16_e32 v1, v1, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: v_mov_b32_e32 v2, 
v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB10_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -2758,29 +2764,29 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 ; GFX90A-NEXT: s_and_b32 s4, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX90A-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 ; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 ; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v5, v0, v0 +; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX90A-NEXT: v_max_f16_e32 v0, v0, v5 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, s6, v5 +; GFX90A-NEXT: 
v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, s6, v2 +; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2793,35 +2799,35 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 ; GFX908-NEXT: s_and_b32 s4, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 -; GFX908-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX908-NEXT: v_mov_b32_e32 v3, s4 +; GFX908-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 ; GFX908-NEXT: s_lshl_b32 s6, s4, 3 ; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v5, v0, v0 +; GFX908-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX908-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX908-NEXT: v_max_f16_e32 v0, v0, v5 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_lshrrev_b32_e32 v1, s6, v5 +; GFX908-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX908-NEXT: v_max_f16_e32 v1, v1, v0 
+; GFX908-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX908-NEXT: v_and_or_b32 v4, v5, s7, v1 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB10_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -2829,36 +2835,36 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 ; GFX8-NEXT: s_and_b32 s4, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 ; GFX8-NEXT: s_lshl_b32 s6, s4, 3 ; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v5, v0, v0 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX8-NEXT: v_max_f16_e32 v0, v0, v5 -; GFX8-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap 
v[2:3], v4, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, s6, v5 +; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX8-NEXT: v_max_f16_e32 v1, v1, v0 +; GFX8-NEXT: v_and_b32_e32 v2, s7, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX8-NEXT: v_or_b32_e32 v4, v2, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -2866,38 +2872,38 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 ; GFX7-NEXT: s_and_b32 s4, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v3, s4 +; GFX7-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_and_b32 s4, s20, 3 ; GFX7-NEXT: s_lshl_b32 s6, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 -; 
GFX7-NEXT: v_max_f32_e32 v0, v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, s6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_and_b32_e32 v2, s7, v5 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX7-NEXT: v_or_b32_e32 v4, v2, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: v_mov_b32_e32 v2, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB10_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -2906,39 +2912,39 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 ; GFX6-NEXT: s_and_b32 s4, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v4, s4 -; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX6-NEXT: v_mov_b32_e32 v3, s4 +; GFX6-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s4, s20, 3 ; GFX6-NEXT: s_lshl_b32 s6, s4, 3 ; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; 
GFX6-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX6-NEXT: v_max_f32_e32 v0, v0, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: v_mov_b32_e32 v3, v1 -; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX6-NEXT: v_lshrrev_b32_e32 v1, s6, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_and_b32_e32 v2, s7, v5 +; GFX6-NEXT: v_max_f32_e32 v1, v1, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX6-NEXT: v_or_b32_e32 v4, v2, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: v_mov_b32_e32 v2, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB10_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -2966,29 +2972,30 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen +; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v3, s[0:3], null offen ; GFX12-TRUE16-NEXT: s_not_b32 
s6, s5 ; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 ; GFX12-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v1 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -3006,39 +3013,40 @@ define void 
@buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v0, v0 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen +; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v3, s[0:3], null offen ; GFX12-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 ; GFX12-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v1 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v3 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v1 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -3053,30 +3061,30 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_addk_i32 s16, 0x200 ; GFX942-NEXT: s_and_b32 s4, s16, -4 -; GFX942-NEXT: v_mov_b32_e32 v2, s4 -; GFX942-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen +; GFX942-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen ; GFX942-NEXT: s_and_b32 s4, s16, 3 ; GFX942-NEXT: s_lshl_b32 s6, s4, 3 ; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX942-NEXT: s_not_b32 s7, s4 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX942-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX942-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX942-NEXT: v_max_f16_e32 v0, v0, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX942-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX942-NEXT: 
v_mov_b64_e32 v[4:5], v[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v2, s6, v5 +; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v2, v2, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, s6, v2 +; GFX942-NEXT: v_and_or_b32 v4, v5, s7, v2 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[4:5] ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 ; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX942-NEXT: s_cbranch_execnz .LBB11_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3095,30 +3103,31 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v3, s[0:3], 0 offen ; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v1 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l -; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: 
v_max_f16_e32 v1.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -3131,38 +3140,39 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3 ; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v3, 
s[0:3], 0 offen ; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v1 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v1, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, 
s5 @@ -3175,32 +3185,32 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 -; GFX10-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_and_b32 s4, s20, 3 ; GFX10-NEXT: s_lshl_b32 s4, s4, 3 ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX10-NEXT: s_not_b32 s6, s5 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX10-NEXT: v_max_f16_e32 v0, v0, v3 -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, s4, v5 +; GFX10-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f16_e32 v1, v1, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: v_mov_b32_e32 v2, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, 
s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB11_1 @@ -3213,29 +3223,29 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 ; GFX90A-NEXT: s_and_b32 s4, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX90A-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 ; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 ; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX90A-NEXT: v_max_f16_e32 v0, v0, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, s6, v5 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, s6, v2 +; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; 
%atomicrmw.end @@ -3247,30 +3257,30 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 ; GFX908-NEXT: s_and_b32 s4, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v2, s4 -; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX908-NEXT: v_mov_b32_e32 v3, s4 +; GFX908-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 ; GFX908-NEXT: s_lshl_b32 s6, s4, 3 ; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX908-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX908-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX908-NEXT: v_max_f16_e32 v0, v0, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX908-NEXT: v_lshrrev_b32_e32 v1, s6, v5 +; GFX908-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX908-NEXT: v_max_f16_e32 v1, v1, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX908-NEXT: v_and_or_b32 v4, v5, s7, v1 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3282,31 +3292,31 @@ define void 
@buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 ; GFX8-NEXT: s_and_b32 s4, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 ; GFX8-NEXT: s_lshl_b32 s6, s4, 3 ; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX8-NEXT: v_max_f16_e32 v0, v0, v3 -; GFX8-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX8-NEXT: v_lshrrev_b32_e32 v1, s6, v5 +; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX8-NEXT: v_max_f16_e32 v1, v1, v0 +; GFX8-NEXT: v_and_b32_e32 v2, s7, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX8-NEXT: v_or_b32_e32 v4, v2, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3318,33 +3328,33 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX7-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 ; GFX7-NEXT: s_and_b32 s4, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v3, s4 +; GFX7-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_and_b32 s4, s20, 3 ; GFX7-NEXT: s_lshl_b32 s6, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX7-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX7-NEXT: v_lshrrev_b32_e32 v1, s6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_and_b32_e32 v2, s7, v5 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX7-NEXT: v_or_b32_e32 v4, v2, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: v_mov_b32_e32 v2, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB11_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3356,34 +3366,34 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX6-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 ; GFX6-NEXT: s_and_b32 s4, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX6-NEXT: v_mov_b32_e32 v3, s4 +; GFX6-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s4, s20, 3 ; GFX6-NEXT: s_lshl_b32 s6, s4, 3 ; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX6-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX6-NEXT: v_mov_b32_e32 v5, v1 -; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v1, s6, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_and_b32_e32 v2, s7, v5 +; GFX6-NEXT: v_max_f32_e32 v1, v1, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX6-NEXT: v_or_b32_e32 v4, v2, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: v_mov_b32_e32 v2, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB11_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3436,18 +3446,21 @@ define half 
@buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v6 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v8, v6 ; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v8 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.h, v5.l, v5.l -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v4.h, v4.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v4.h, v4.l ; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v11, v5 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v7, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v7, v8, v11, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v8 ; GFX12-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -3462,14 +3475,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[7:8], v10, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v10, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; 
GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_4 ; GFX12-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v7, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -3477,7 +3489,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_3 ; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v9, v7 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v9, v6 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -3510,7 +3522,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen +; GFX12-FAKE16-NEXT: buffer_load_b32 v4, v8, s[4:7], null offen ; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-FAKE16-NEXT: ; %bb.2: @@ -3521,17 +3533,19 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-FAKE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s2, exec_lo ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX12-FAKE16-NEXT: 
v_max_num_f16_e32 v4, v4, v4 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v4, v10 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v4, v10 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v6 ; GFX12-FAKE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 @@ -3555,7 +3569,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -3588,7 +3601,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX942-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen +; GFX942-NEXT: buffer_load_dword v4, v9, s[4:7], 0 offen ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB12_1 ; GFX942-NEXT: ; %bb.2: @@ -3599,6 +3612,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX942-NEXT: ; =>This Loop Header: Depth=1 ; GFX942-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v4 ; GFX942-NEXT: v_lshrrev_b32_e32 v4, v8, v7 ; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX942-NEXT: v_max_f16_e32 v4, v4, v11 @@ 
-3627,7 +3641,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v7, v4 ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB12_3 @@ -3671,18 +3684,21 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, v6 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v8 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.h, v5.l, v5.l -; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v4.h, v4.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v4.h, v4.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v11, v5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v7, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v7, v8, v11, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v8 ; GFX11-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -3696,14 +3712,13 @@ define half 
@buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[7:8], v10, s[4:7], 0 offen glc +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v10, s[4:7], 0 offen glc ; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_4 ; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v7, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv ; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -3712,7 +3727,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_3 ; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v9, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v9, v6 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -3739,7 +3754,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen +; GFX11-FAKE16-NEXT: buffer_load_b32 v4, v8, s[4:7], 0 offen ; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB12_1 ; GFX11-FAKE16-NEXT: ; %bb.2: @@ -3750,17 +3765,19 @@ define half 
@buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-FAKE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v4, v10 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v4, v10 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v5 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6 ; GFX11-FAKE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 @@ -3783,7 +3800,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -3815,7 +3831,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX10-NEXT: buffer_load_dword v4, v8, 
s[8:11], 0 offen ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB12_1 @@ -3826,9 +3842,10 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX10-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX10-NEXT: v_max_f16_e32 v4, v4, v10 ; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 @@ -3854,7 +3871,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 @@ -3887,7 +3903,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen +; GFX90A-NEXT: buffer_load_dword v4, v9, s[8:11], 0 offen ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: @@ -3898,6 +3914,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v8, v7 ; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX90A-NEXT: v_max_f16_e32 v4, v4, v11 @@ -3924,7 +3941,6 @@ define half 
@buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_3 @@ -3954,7 +3970,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX908-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_1 ; GFX908-NEXT: ; %bb.2: @@ -3965,6 +3981,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX908-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX908-NEXT: v_max_f16_e32 v4, v4, v10 @@ -3992,7 +4009,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB12_3 @@ -4022,7 +4038,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX8-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_1 ; GFX8-NEXT: ; %bb.2: @@ -4033,6 
+4049,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX8-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX8-NEXT: v_max_f16_e32 v4, v4, v10 @@ -4061,7 +4078,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB12_3 @@ -4089,18 +4105,19 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX7-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB12_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v5 ; GFX7-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 ; GFX7-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_and_b32_e32 v5, v6, v9 @@ -4130,7 +4147,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: 
buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB12_3 @@ -4159,18 +4175,20 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX6-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB12_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v5 ; GFX6-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 ; GFX6-NEXT: ; Child Loop BB12_4 Depth 2 -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_and_b32_e32 v5, v6, v9 @@ -4200,7 +4218,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_cbranch_execnz .LBB12_3 @@ -4231,53 +4248,53 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, s4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 ; 
GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen +; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v3, s[0:3], null offen ; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 ; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v1, v1, v3 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v1, v1, v2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v2.l ; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 ; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 
v1, s4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -4289,54 +4306,54 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; 
GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, s4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen +; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v3, s[0:3], null offen ; GFX12-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 ; GFX12-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v1 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v0, v0, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX12-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v1, v1, v0 +; GFX12-FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -4345,36 +4362,36 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_addk_i32 s16, 0x200 ; GFX942-NEXT: s_and_b32 s4, s16, -4 -; GFX942-NEXT: v_mov_b32_e32 v4, s4 -; GFX942-NEXT: 
buffer_load_dword v1, v4, s[0:3], 0 offen +; GFX942-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen ; GFX942-NEXT: s_and_b32 s4, s16, 3 ; GFX942-NEXT: s_lshl_b32 s6, s4, 3 ; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX942-NEXT: s_not_b32 s7, s4 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX942-NEXT: s_movk_i32 s8, 0x7fff ; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_lshrrev_b32_sdwa v2, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: v_max_f32_e32 v0, v0, v5 -; GFX942-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX942-NEXT: v_add3_u32 v2, v2, v0, s8 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v0 +; GFX942-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v3, v3, v2, s8 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1] -; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v2, s6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v4, v5, s7, v2 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[4:5] +; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; 
GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 ; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX942-NEXT: s_cbranch_execnz .LBB13_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4388,146 +4405,146 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v3, s[0:3], 0 offen ; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | 
instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v2.l ; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v3 -; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB13_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end 
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3 ; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v3, s[0:3], 0 offen ; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v1 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-FAKE16-NEXT: v_max_f32_e32 v0, v0, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, 
v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v1, v1, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB13_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-FAKE16-NEXT: s_or_b32 
exec_lo, exec_lo, s5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_and_b32 s4, s20, 3 ; GFX10-NEXT: s_lshl_b32 s4, s4, 3 ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX10-NEXT: s_not_b32 s6, s5 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f32_e32 v0, v0, v5 -; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX10-NEXT: v_lshrrev_b32_sdwa v1, s4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_max_f32_e32 v1, v1, v0 +; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX10-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: v_mov_b32_e32 v2, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB13_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -4535,34 +4552,34 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 ; GFX90A-NEXT: s_and_b32 s4, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX90A-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 ; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 ; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_max_f32_e32 v0, v0, v5 -; GFX90A-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v0 +; GFX90A-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v3, v3, v2, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, s6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4575,40 +4592,40 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 ; GFX908-NEXT: s_and_b32 s4, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 -; GFX908-NEXT: buffer_load_dword 
v1, v4, s[16:19], 0 offen +; GFX908-NEXT: v_mov_b32_e32 v3, s4 +; GFX908-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 ; GFX908-NEXT: s_lshl_b32 s6, s4, 3 ; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_max_f32_e32 v0, v0, v5 -; GFX908-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v2, v2, v0, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX908-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_lshrrev_b32_sdwa v1, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_max_f32_e32 v1, v1, v0 +; GFX908-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX908-NEXT: v_add3_u32 v2, v2, v1, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX908-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v1, s6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v4, v5, s7, v1 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; 
GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -4616,42 +4633,42 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 ; GFX8-NEXT: s_and_b32 s4, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 ; GFX8-NEXT: s_lshl_b32 s6, s4, 3 ; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f32_e32 v3, v3, v5 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_lshrrev_b32_sdwa v1, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f32_e32 v1, v1, v0 +; GFX8-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v1 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, 
v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v4, s7, v5 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -4659,39 +4676,39 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 ; GFX7-NEXT: s_and_b32 s4, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v3, s4 +; GFX7-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX7-NEXT: s_and_b32 s4, s20, 3 ; GFX7-NEXT: 
s_lshl_b32 s6, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, s6, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v2, s7, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX7-NEXT: v_or_b32_e32 v4, v2, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: v_mov_b32_e32 v2, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -4700,40 +4717,40 @@ define bfloat 
@buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 ; GFX6-NEXT: s_and_b32 s4, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v4, s4 -; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX6-NEXT: v_mov_b32_e32 v3, s4 +; GFX6-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX6-NEXT: s_and_b32 s4, s20, 3 ; GFX6-NEXT: s_lshl_b32 s6, s4, 3 ; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: v_max_f32_e32 v0, v0, v5 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: v_mov_b32_e32 v3, v1 -; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX6-NEXT: v_lshrrev_b32_e32 v1, s6, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX6-NEXT: v_max_f32_e32 v1, v1, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_and_b32_e32 v2, s7, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX6-NEXT: v_or_b32_e32 v4, v2, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: v_mov_b32_e32 v2, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; 
GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB13_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -4760,40 +4777,40 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen +; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v3, s[0:3], null offen ; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 ; GFX12-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v1, v1, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, v4.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v1, v1, v2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v2.l +; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 ; GFX12-TRUE16-NEXT: v_or_b32_e32 
v6, 0x400000, v1 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.h -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -4811,47 +4828,47 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; 
GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen +; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v3, s[0:3], null offen ; GFX12-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 ; GFX12-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v1 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v0, v0, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX12-FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v1, v1, v0 +; GFX12-FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -4866,36 +4883,36 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_addk_i32 s16, 0x200 ; GFX942-NEXT: s_and_b32 s4, s16, -4 -; GFX942-NEXT: v_mov_b32_e32 v2, s4 -; GFX942-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen +; GFX942-NEXT: v_mov_b32_e32 
v1, s4 +; GFX942-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen ; GFX942-NEXT: s_and_b32 s4, s16, 3 ; GFX942-NEXT: s_lshl_b32 s6, s4, 3 ; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX942-NEXT: s_not_b32 s7, s4 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX942-NEXT: s_movk_i32 s8, 0x7fff ; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_lshrrev_b32_sdwa v2, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX942-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX942-NEXT: v_add3_u32 v4, v4, v0, s8 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v0 +; GFX942-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v3, v3, v2, s8 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] -; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v2, s6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v4, v5, s7, v2 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[4:5] +; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; 
GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 ; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX942-NEXT: s_cbranch_execnz .LBB14_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4913,40 +4930,40 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v3, s[0:3], 0 offen ; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v1, v1, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v4.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v2.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 
vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v4 -; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -4959,45 +4976,45 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4 ; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3 ; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v3, s[0:3], 0 offen ; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v1 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-FAKE16-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v1, v1, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; 
GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -5010,36 +5027,36 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_and_b32 s4, s20, 3 ; GFX10-NEXT: s_lshl_b32 s4, s4, 3 ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX10-NEXT: s_not_b32 s6, s5 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB14_1: ; 
%atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshrrev_b32_sdwa v1, s4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_max_f32_e32 v1, v1, v0 +; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: v_mov_b32_e32 v2, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB14_1 @@ -5052,34 +5069,34 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 ; GFX90A-NEXT: s_and_b32 s4, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX90A-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 ; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 ; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX90A-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v0 +; GFX90A-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v3, v3, v2, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, s6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v2 +; 
GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5091,35 +5108,35 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 ; GFX908-NEXT: s_and_b32 s4, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v2, s4 -; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX908-NEXT: v_mov_b32_e32 v3, s4 +; GFX908-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 ; GFX908-NEXT: s_lshl_b32 s6, s4, 3 ; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX908-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v4, v4, v0, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX908-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: 
buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX908-NEXT: v_lshrrev_b32_sdwa v1, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_max_f32_e32 v1, v1, v0 +; GFX908-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX908-NEXT: v_add3_u32 v2, v2, v1, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX908-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v1, s6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v4, v5, s7, v1 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5131,37 +5148,37 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 ; GFX8-NEXT: s_and_b32 s4, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 ; GFX8-NEXT: s_lshl_b32 s6, s4, 3 ; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v5, 
v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f32_e32 v5, v5, v3 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX8-NEXT: v_lshrrev_b32_sdwa v1, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f32_e32 v1, v1, v0 +; GFX8-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v4, s7, v5 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5173,34 +5190,34 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 ; GFX7-NEXT: 
s_and_b32 s4, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v3, s4 +; GFX7-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX7-NEXT: s_and_b32 s4, s20, 3 ; GFX7-NEXT: s_lshl_b32 s6, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX7-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX7-NEXT: v_lshrrev_b32_e32 v1, s6, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v2, s7, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX7-NEXT: v_or_b32_e32 v4, v2, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: v_mov_b32_e32 v2, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB14_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5212,35 +5229,35 @@ define void 
@buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 ; GFX6-NEXT: s_and_b32 s4, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX6-NEXT: v_mov_b32_e32 v3, s4 +; GFX6-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX6-NEXT: s_and_b32 s4, s20, 3 ; GFX6-NEXT: s_lshl_b32 s6, s4, 3 ; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX6-NEXT: v_mov_b32_e32 v5, v1 -; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v1, s6, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX6-NEXT: v_max_f32_e32 v1, v1, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_and_b32_e32 v2, s7, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX6-NEXT: v_or_b32_e32 v4, v2, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: v_mov_b32_e32 v2, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; 
GFX6-NEXT: v_mov_b32_e32 v1, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB14_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5264,11 +5281,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v6 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, -4, v6 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v11, v7 +; GFX12-TRUE16-NEXT: v_not_b32_e32 v10, v7 ; GFX12-TRUE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 @@ -5282,7 +5299,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen +; GFX12-TRUE16-NEXT: buffer_load_b32 v6, v9, s[4:7], null offen ; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-TRUE16-NEXT: ; %bb.2: @@ -5292,28 +5309,31 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX12-TRUE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.h, v5.l +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v8, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v5.l ; 
GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v8 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v6, v6, v8 -; GFX12-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v6, v6, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v11, v6, 16, 1 ; GFX12-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v6 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v11, v11, v6, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v12, vcc_lo -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v11, v12, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v8.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v7.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX12-TRUE16-NEXT: v_and_or_b32 v7, v8, v10, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v8 ; GFX12-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX12-TRUE16-NEXT: ; 
=> This Inner Loop Header: Depth=2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -5328,14 +5348,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_4 ; GFX12-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v8 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -5343,7 +5362,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_3 ; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v6 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -5376,7 +5395,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen +; GFX12-FAKE16-NEXT: buffer_load_b32 v4, v8, s[4:7], null offen ; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-FAKE16-NEXT: ; %bb.2: @@ -5387,25 +5406,26 @@ define 
bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX12-FAKE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s2, exec_lo ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v4, v4, v10 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX12-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v4 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v6 ; GFX12-FAKE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 @@ -5429,7 +5449,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; 
GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -5462,7 +5481,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX942-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen +; GFX942-NEXT: buffer_load_dword v4, v9, s[4:7], 0 offen ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB15_1 ; GFX942-NEXT: ; %bb.2: @@ -5474,6 +5493,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX942-NEXT: ; =>This Loop Header: Depth=1 ; GFX942-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v4 ; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_mov_b64 s[8:9], exec ; GFX942-NEXT: v_max_f32_e32 v4, v4, v11 @@ -5507,7 +5527,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v7, v4 ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB15_3 @@ -5524,11 +5543,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, -4, v6 
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v11, v7 +; GFX11-TRUE16-NEXT: v_not_b32_e32 v10, v7 ; GFX11-TRUE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 @@ -5540,7 +5559,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: buffer_load_b32 v7, v10, s[4:7], 0 offen +; GFX11-TRUE16-NEXT: buffer_load_b32 v6, v9, s[4:7], 0 offen ; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_1 ; GFX11-TRUE16-NEXT: ; %bb.2: @@ -5551,27 +5570,30 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v5.l +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v5.l ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v8 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f32_e32 v6, v6, v8 -; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v6, v6, v7 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v6, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v6 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v12, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v6, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v11, v12, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v7.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX11-TRUE16-NEXT: v_and_or_b32 v7, v8, v10, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v8 ; GFX11-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -5585,14 +5607,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], 0 offen glc +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc ; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; 
GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_4 ; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v8 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv ; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -5602,7 +5623,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v6 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -5629,7 +5650,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen +; GFX11-FAKE16-NEXT: buffer_load_b32 v4, v8, s[4:7], 0 offen ; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB15_1 ; GFX11-FAKE16-NEXT: ; %bb.2: @@ -5641,24 +5662,25 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX11-FAKE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_max_f32_e32 v4, v4, v10 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v4 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v5 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6 ; GFX11-FAKE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 @@ -5681,7 +5703,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -5714,7 +5735,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 
-; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX10-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB15_1 @@ -5725,9 +5746,10 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_max_f32_e32 v4, v4, v10 ; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v4 @@ -5757,7 +5779,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 @@ -5790,7 +5811,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen +; GFX90A-NEXT: buffer_load_dword v4, v9, s[8:11], 0 offen ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 ; GFX90A-NEXT: ; %bb.2: @@ -5802,6 +5823,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: 
v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX90A-NEXT: v_max_f32_e32 v4, v4, v11 ; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 @@ -5832,7 +5854,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB15_3 @@ -5862,7 +5883,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX908-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB15_1 ; GFX908-NEXT: ; %bb.2: @@ -5874,6 +5895,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX908-NEXT: v_max_f32_e32 v4, v4, v10 ; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 @@ -5905,7 +5927,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB15_3 @@ -5935,7 +5956,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: 
s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX8-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB15_1 ; GFX8-NEXT: ; %bb.2: @@ -5946,6 +5967,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_max_f32_e32 v4, v4, v10 ; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1 @@ -5979,7 +6001,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB15_3 @@ -6007,18 +6028,19 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX7-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB15_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX7-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 ; GFX7-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: 
v_lshrrev_b32_e32 v4, v7, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -6049,7 +6071,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB15_3 @@ -6078,18 +6099,20 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX6-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB15_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v5 +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 -; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX6-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 ; GFX6-NEXT: ; Child Loop BB15_4 Depth 2 -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -6120,7 +6143,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_cbranch_execnz .LBB15_3 @@ -6470,7 +6492,7 @@ define void 
@buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, s16 ; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 -; GFX12-NEXT: v_pk_max_num_f16 v2, v0, v0 +; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 @@ -6478,16 +6500,17 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v0, v1, v1 +; GFX12-NEXT: v_mov_b32_e32 v5, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v2 -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_pk_max_num_f16 v1, v5, v5 +; GFX12-NEXT: v_pk_max_num_f16 v4, v1, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -6501,25 +6524,25 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v1, s16 -; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:1024 ; GFX942-NEXT: s_add_i32 s6, s16, 0x400 ; 
GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, s6 +; GFX942-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX942-NEXT: v_mov_b32_e32 v1, s6 ; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_pk_max_f16 v2, v5, v5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX942-NEXT: v_pk_max_f16 v4, v2, v0 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] -; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[4:5] +; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 ; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX942-NEXT: s_cbranch_execnz .LBB17_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6531,24 +6554,25 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s16 ; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: v_pk_max_f16 v2, v0, v0 +; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 ; GFX11-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX11-NEXT: v_mov_b32_e32 v5, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX11-NEXT: v_dual_mov_b32 
v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc +; GFX11-NEXT: v_pk_max_f16 v1, v5, v5 +; GFX11-NEXT: v_pk_max_f16 v4, v1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -6562,24 +6586,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 ; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_pk_max_f16 v2, v0, v0 +; GFX10-NEXT: v_pk_max_f16 v0, v0, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, v1, v1 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_max_f16 v0, v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_max_f16 v1, v5, v5 +; GFX10-NEXT: v_pk_max_f16 v4, v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: v_mov_b32_e32 v2, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; 
GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB17_1 @@ -6591,23 +6615,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 -; GFX90A-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_pk_max_f16 v2, v5, v5 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6621,21 +6645,21 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: 
v_pk_max_f16 v2, v0, v0 +; GFX908-NEXT: v_pk_max_f16 v0, v0, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v0, v1, v1 -; GFX908-NEXT: v_pk_max_f16 v0, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_pk_max_f16 v1, v5, v5 +; GFX908-NEXT: v_pk_max_f16 v4, v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6649,25 +6673,25 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v2, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX8-NEXT: v_max_f16_sdwa v3, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v1, v1 -; GFX8-NEXT: v_max_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; 
GFX8-NEXT: v_max_f16_e32 v5, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX8-NEXT: v_mov_b32_e32 v6, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX8-NEXT: v_max_f16_sdwa v1, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v2, v6, v6 +; GFX8-NEXT: v_max_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v2, v2, v0 +; GFX8-NEXT: v_or_b32_e32 v5, v2, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v2, v6 +; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7933,43 +7957,45 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, s16 ; GFX12-TRUE16-NEXT: s_add_co_i32 s4, s16, 0x400 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 ; GFX12-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX12-TRUE16-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; 
GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v1 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v5, v5, v2 :: v_dual_max_num_f32 v0, v0, v3 -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v2, v2, v3 :: v_dual_lshlrev_b32 v1, 16, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v1, v1, v0 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h ; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v1, v2 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v4, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v6 ; GFX12-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -7986,42 +8012,44 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, s16 ; GFX12-FAKE16-NEXT: s_add_co_i32 s4, s16, 0x400 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 ; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 ; GFX12-FAKE16-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v1 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v5, v5, v3 :: v_dual_max_num_f32 v0, v0, v2 -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v2, v2, v0 :: v_dual_lshlrev_b32 v1, 16, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v1, v1, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v1, v1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v1, v5, v8, s4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen 
th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: v_perm_b32 v5, v2, v1, 0x7060302 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v4, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v5 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v6 ; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -8035,40 +8063,40 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v1, s16 -; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:1024 ; GFX942-NEXT: s_add_i32 s4, s16, 0x400 ; GFX942-NEXT: s_mov_b64 s[6:7], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX942-NEXT: s_movk_i32 s8, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX942-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX942-NEXT: s_mov_b32 s9, 0x7060302 ; GFX942-NEXT: v_mov_b32_e32 v4, s4 ; GFX942-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX942-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX942-NEXT: v_max_f32_e32 v5, v5, v3 -; GFX942-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX942-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], 
v0, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v1 +; GFX942-NEXT: v_max_f32_e32 v3, v3, v0 +; GFX942-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v5, v5, v2, s8 +; GFX942-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX942-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1] -; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX942-NEXT: v_perm_b32 v6, v3, v2, s9 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[6:7] +; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v7 ; GFX942-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v1, v6 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX942-NEXT: s_cbranch_execnz .LBB20_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8078,10 +8106,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s16 ; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 @@ -8089,34 +8118,35 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-TRUE16-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v1 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_max_f32 v5, v5, v2 :: v_dual_max_f32 v0, v0, v3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_dual_max_f32 v2, v2, v3 :: v_dual_lshlrev_b32 v1, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v1, v1, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; 
GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v1, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v4, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v6 ; GFX11-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -8129,10 +8159,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s16 ; GFX11-FAKE16-NEXT: s_add_i32 s4, 
s16, 0x400 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 @@ -8140,32 +8171,33 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-FAKE16-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v1 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_max_f32 v5, v5, v3 :: v_dual_max_f32 v0, v0, v2 -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_dual_max_f32 v2, v2, v0 :: v_dual_lshlrev_b32 v1, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; 
GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v1, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, v5, v8, s4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v2, v1, 0x7060302 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v4, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v5 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v6 ; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -8180,38 +8212,38 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 ; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, s4 ; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: s_mov_b32 s5, 0 ; 
GFX10-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_max_f32_e32 v5, v5, v3 -; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v6, v1 -; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v0 +; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v1, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, v8, s4 +; GFX10-NEXT: v_perm_b32 v5, v2, v1, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_mov_b32_e32 v2, v6 +; GFX10-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; 
GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB20_1 @@ -8223,39 +8255,39 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 ; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX90A-NEXT: v_max_f32_e32 v5, v5, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; GFX90A-NEXT: v_max_f32_e32 
v2, v2, v1 +; GFX90A-NEXT: v_max_f32_e32 v3, v3, v0 +; GFX90A-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v5, v5, v2, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v6, v3, v2, s9 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8269,37 +8301,37 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s4, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: v_mov_b32_e32 v4, s4 ; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX908-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX908-NEXT: v_max_f32_e32 v5, v5, v3 -; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: 
v_or_b32_e32 v7, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 ; GFX908-NEXT: v_mov_b32_e32 v6, v1 -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX908-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v0 +; GFX908-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v2, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX908-NEXT: v_add3_u32 v5, v5, v1, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v2, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX908-NEXT: v_cndmask_b32_e64 v1, v5, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v5, v2, v1, s9 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: v_mov_b32_e32 v2, v6 +; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB20_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8313,38 +8345,38 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 
16, v0 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX8-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_max_f32_e32 v5, v5, v3 -; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 ; GFX8-NEXT: v_mov_b32_e32 v6, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX8-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_max_f32_e32 v2, v2, v0 +; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v2 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v5, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: 
v_alignbit_b32 v5, v2, v1, 16 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v2, v6 +; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB20_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll index acb27be1846b9..f0e559aa6d730 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll @@ -211,24 +211,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v1, s16 -; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:1024 ; GFX942-NEXT: s_add_i32 s6, s16, 0x400 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v0, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, s6 +; GFX942-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX942-NEXT: v_mov_b32_e32 v1, s6 ; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX942-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v5, v5 +; GFX942-NEXT: v_min_f32_e32 v4, v2, v0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[4:5] ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 
offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 ; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX942-NEXT: s_cbranch_execnz .LBB1_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -261,23 +261,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v0, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v2, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -291,21 +291,21 @@ define void 
@buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v0, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX908-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_max_f32_e32 v1, v5, v5 +; GFX908-NEXT: v_min_f32_e32 v4, v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB1_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -319,21 +319,21 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX8-NEXT: 
v_mul_f32_e32 v1, 1.0, v5 +; GFX8-NEXT: v_min_f32_e32 v4, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB1_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1396,7 +1396,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, s16 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 @@ -1405,17 +1405,18 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 -; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[9:10], v[9:10] +; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[2:3], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 
+; GFX12-NEXT: v_dual_mov_b32 v4, v9 :: v_dual_mov_b32 v5, v10 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[2:5], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] -; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[9:10] ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1439,7 +1440,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v2, s16 -; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX11-NEXT: s_add_i32 s4, s16, 0x800 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v6, s4 @@ -1448,18 +1449,19 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 -; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[2:3], v[9:10], v[9:10] +; GFX11-NEXT: v_min_f64 v[7:8], v[2:3], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX11-NEXT: v_dual_mov_b32 v4, v9 :: v_dual_mov_b32 v5, 
v10 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[2:5], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] -; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[9:10] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1493,26 +1495,26 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v2, s20 ; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048 -; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX908-NEXT: v_mov_b32_e32 v10, v3 ; GFX908-NEXT: v_mov_b32_e32 v9, v2 -; GFX908-NEXT: v_mov_b32_e32 v8, v1 -; GFX908-NEXT: v_mov_b32_e32 v7, v0 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc +; GFX908-NEXT: v_max_f64 v[2:3], v[9:10], v[9:10] +; GFX908-NEXT: v_min_f64 v[7:8], v[2:3], v[0:1] +; GFX908-NEXT: v_mov_b32_e32 v2, v7 +; GFX908-NEXT: v_mov_b32_e32 v3, v8 +; GFX908-NEXT: v_mov_b32_e32 v4, v9 +; GFX908-NEXT: v_mov_b32_e32 v5, v10 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[2:5], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] -; GFX908-NEXT: v_mov_b32_e32 v2, v7 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[9:10] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; 
GFX908-NEXT: v_mov_b32_e32 v3, v8 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB6_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1524,26 +1526,26 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, s20 ; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048 -; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v10, v3 ; GFX8-NEXT: v_mov_b32_e32 v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v8, v1 -; GFX8-NEXT: v_mov_b32_e32 v7, v0 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_max_f64 v[2:3], v[9:10], v[9:10] +; GFX8-NEXT: v_min_f64 v[7:8], v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, v7 +; GFX8-NEXT: v_mov_b32_e32 v3, v8 +; GFX8-NEXT: v_mov_b32_e32 v4, v9 +; GFX8-NEXT: v_mov_b32_e32 v5, v10 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[2:5], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, v7 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[9:10] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v8 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB6_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2499,42 +2501,43 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; 
GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, s4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen +; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v3, s[0:3], null offen ; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 ; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v1 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v1.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v1.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen 
th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -2546,46 +2549,47 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v0, v0 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, s4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen +; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v3, s[0:3], null offen ; GFX12-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 ; GFX12-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v1 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0 -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v0, v0, v5 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v1 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v1, v1, v0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; 
GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -2594,30 +2598,30 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_addk_i32 s16, 0x200 ; GFX942-NEXT: s_and_b32 s4, s16, -4 -; GFX942-NEXT: v_mov_b32_e32 v4, s4 -; GFX942-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen +; GFX942-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen ; GFX942-NEXT: s_and_b32 s4, s16, 3 ; GFX942-NEXT: s_lshl_b32 s6, s4, 3 ; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX942-NEXT: s_not_b32 s7, s4 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_max_f16_e32 v5, v0, v0 +; GFX942-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX942-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX942-NEXT: v_min_f16_e32 v0, v0, v5 -; GFX942-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX942-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v2, s6, v5 +; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX942-NEXT: v_min_f16_e32 v2, v2, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, s6, v2 +; GFX942-NEXT: v_and_or_b32 v4, v5, s7, v2 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[4:5] ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 ; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX942-NEXT: s_cbranch_execnz .LBB10_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2632,125 +2636,127 @@ define half 
@buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l ; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3 ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v3, s[0:3], 0 offen ; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v1 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l -; GFX11-TRUE16-NEXT: v_min_f16_e32 v1.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_min_f16_e32 v1.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB10_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v0, v0 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3 ; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v3, s[0:3], 0 offen ; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v1 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-FAKE16-NEXT: v_min_f16_e32 v0, v0, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v1, v1, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB10_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] 
; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 -; GFX10-NEXT: v_max_f16_e32 v5, v0, v0 +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_and_b32 s4, s20, 3 ; GFX10-NEXT: s_lshl_b32 s4, s4, 3 ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX10-NEXT: s_not_b32 s6, s5 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX10-NEXT: v_min_f16_e32 v0, v0, v5 -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX10-NEXT: v_lshrrev_b32_e32 v1, s4, v5 +; GFX10-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX10-NEXT: v_min_f16_e32 v1, v1, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: v_mov_b32_e32 v2, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; 
GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB10_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -2758,29 +2764,29 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 ; GFX90A-NEXT: s_and_b32 s4, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX90A-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 ; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 ; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v5, v0, v0 +; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX90A-NEXT: v_min_f16_e32 v0, v0, v5 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, s6, v5 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_min_f16_e32 v2, v2, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, s6, v2 +; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; 
GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2793,35 +2799,35 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 ; GFX908-NEXT: s_and_b32 s4, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 -; GFX908-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX908-NEXT: v_mov_b32_e32 v3, s4 +; GFX908-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 ; GFX908-NEXT: s_lshl_b32 s6, s4, 3 ; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v5, v0, v0 +; GFX908-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX908-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX908-NEXT: v_min_f16_e32 v0, v0, v5 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_lshrrev_b32_e32 v1, s6, v5 +; GFX908-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX908-NEXT: v_min_f16_e32 v1, v1, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX908-NEXT: v_and_or_b32 v4, v5, s7, v1 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: 
v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB10_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -2829,36 +2835,36 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 ; GFX8-NEXT: s_and_b32 s4, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 ; GFX8-NEXT: s_lshl_b32 s6, s4, 3 ; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v5, v0, v0 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX8-NEXT: v_min_f16_e32 v0, v0, v5 -; GFX8-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, s6, v5 +; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX8-NEXT: v_min_f16_e32 v1, v1, v0 +; GFX8-NEXT: v_and_b32_e32 v2, s7, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX8-NEXT: v_or_b32_e32 v4, v2, v1 +; 
GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -2866,38 +2872,38 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 ; GFX7-NEXT: s_and_b32 s4, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v3, s4 +; GFX7-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_and_b32 s4, s20, 3 ; GFX7-NEXT: s_lshl_b32 s6, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX7-NEXT: 
v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, s6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_and_b32_e32 v2, s7, v5 +; GFX7-NEXT: v_min_f32_e32 v1, v1, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX7-NEXT: v_or_b32_e32 v4, v2, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: v_mov_b32_e32 v2, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB10_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -2906,39 +2912,39 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 ; GFX6-NEXT: s_and_b32 s4, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v4, s4 -; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX6-NEXT: v_mov_b32_e32 v3, s4 +; GFX6-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s4, s20, 3 ; GFX6-NEXT: s_lshl_b32 s6, s4, 3 ; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 
-; GFX6-NEXT: v_min_f32_e32 v0, v0, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: v_mov_b32_e32 v3, v1 -; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX6-NEXT: v_lshrrev_b32_e32 v1, s6, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_and_b32_e32 v2, s7, v5 +; GFX6-NEXT: v_min_f32_e32 v1, v1, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX6-NEXT: v_or_b32_e32 v4, v2, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: v_mov_b32_e32 v2, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB10_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -2966,29 +2972,30 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen +; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v3, s[0:3], null offen ; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 ; GFX12-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v1 ; GFX12-TRUE16-NEXT: 
s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v1.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v1.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -3006,39 +3013,40 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v0, v0 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; 
GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen +; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v3, s[0:3], null offen ; GFX12-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 ; GFX12-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v1 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0 -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v0, v0, v3 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v1 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v1, v1, v0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v5, s6, v1 
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -3053,30 +3061,30 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_addk_i32 s16, 0x200 ; GFX942-NEXT: s_and_b32 s4, s16, -4 -; GFX942-NEXT: v_mov_b32_e32 v2, s4 -; GFX942-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen +; GFX942-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen ; GFX942-NEXT: s_and_b32 s4, s16, 3 ; GFX942-NEXT: s_lshl_b32 s6, s4, 3 ; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX942-NEXT: s_not_b32 s7, s4 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX942-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX942-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX942-NEXT: v_min_f16_e32 v0, v0, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX942-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v2, s6, v5 +; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX942-NEXT: v_min_f16_e32 v2, v2, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, s6, v2 +; GFX942-NEXT: v_and_or_b32 v4, v5, s7, v2 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[4:5] ; GFX942-NEXT: 
buffer_wbl2 sc1 -; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 ; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX942-NEXT: s_cbranch_execnz .LBB11_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3095,30 +3103,31 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v3, s[0:3], 0 offen ; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v1 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l -; GFX11-TRUE16-NEXT: v_min_f16_e32 v1.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_min_f16_e32 v1.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 -; 
GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -3131,38 +3140,39 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3 ; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v3, s[0:3], 0 offen ; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-FAKE16-NEXT: 
v_mov_b32_e32 v5, v1 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-FAKE16-NEXT: v_min_f16_e32 v0, v0, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v1, v1, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -3175,32 +3185,32 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 -; GFX10-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; 
GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_and_b32 s4, s20, 3 ; GFX10-NEXT: s_lshl_b32 s4, s4, 3 ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX10-NEXT: s_not_b32 s6, s5 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX10-NEXT: v_min_f16_e32 v0, v0, v3 -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, s4, v5 +; GFX10-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX10-NEXT: v_min_f16_e32 v1, v1, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: v_mov_b32_e32 v2, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB11_1 @@ -3213,29 +3223,29 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 ; GFX90A-NEXT: s_and_b32 s4, s20, -4 -; 
GFX90A-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX90A-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 ; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 ; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX90A-NEXT: v_min_f16_e32 v0, v0, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, s6, v5 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_min_f16_e32 v2, v2, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, s6, v2 +; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3247,30 +3257,30 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 ; GFX908-NEXT: s_and_b32 s4, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v2, s4 -; GFX908-NEXT: buffer_load_dword v1, 
v2, s[16:19], 0 offen +; GFX908-NEXT: v_mov_b32_e32 v3, s4 +; GFX908-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 ; GFX908-NEXT: s_lshl_b32 s6, s4, 3 ; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX908-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX908-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX908-NEXT: v_min_f16_e32 v0, v0, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX908-NEXT: v_lshrrev_b32_e32 v1, s6, v5 +; GFX908-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX908-NEXT: v_min_f16_e32 v1, v1, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX908-NEXT: v_and_or_b32 v4, v5, s7, v1 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3282,31 +3292,31 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 ; GFX8-NEXT: s_and_b32 s4, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: buffer_load_dword v1, v3, 
s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 ; GFX8-NEXT: s_lshl_b32 s6, s4, 3 ; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX8-NEXT: v_min_f16_e32 v0, v0, v3 -; GFX8-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX8-NEXT: v_lshrrev_b32_e32 v1, s6, v5 +; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX8-NEXT: v_min_f16_e32 v1, v1, v0 +; GFX8-NEXT: v_and_b32_e32 v2, s7, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX8-NEXT: v_or_b32_e32 v4, v2, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3318,33 +3328,33 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 ; GFX7-NEXT: s_and_b32 s4, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v3, s4 +; GFX7-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_and_b32 s4, s20, 3 ; GFX7-NEXT: 
s_lshl_b32 s6, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX7-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX7-NEXT: v_lshrrev_b32_e32 v1, s6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_and_b32_e32 v2, s7, v5 +; GFX7-NEXT: v_min_f32_e32 v1, v1, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX7-NEXT: v_or_b32_e32 v4, v2, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: v_mov_b32_e32 v2, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB11_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3356,34 +3366,34 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 ; GFX6-NEXT: s_and_b32 s4, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX6-NEXT: v_mov_b32_e32 v3, s4 +; GFX6-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s4, s20, 3 ; 
GFX6-NEXT: s_lshl_b32 s6, s4, 3 ; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX6-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX6-NEXT: v_mov_b32_e32 v5, v1 -; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v1, s6, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_and_b32_e32 v2, s7, v5 +; GFX6-NEXT: v_min_f32_e32 v1, v1, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX6-NEXT: v_or_b32_e32 v4, v2, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: v_mov_b32_e32 v2, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB11_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3436,18 +3446,21 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v6 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v8, v6 ; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX12-TRUE16-NEXT: 
s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v8 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.h, v5.l, v5.l -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v4.h, v4.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v4.h, v4.l ; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v11, v5 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v7, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v7, v8, v11, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v8 ; GFX12-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -3462,14 +3475,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[7:8], v10, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v10, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_4 ; GFX12-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v7, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 ; GFX12-TRUE16-NEXT: 
global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -3477,7 +3489,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_3 ; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v9, v7 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v9, v6 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -3510,7 +3522,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen +; GFX12-FAKE16-NEXT: buffer_load_b32 v4, v8, s[4:7], null offen ; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-FAKE16-NEXT: ; %bb.2: @@ -3521,17 +3533,19 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-FAKE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s2, exec_lo ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v4, v4 -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v4, v4, v10 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v4, v4, v10 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 
| instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v6 ; GFX12-FAKE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 @@ -3555,7 +3569,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -3588,7 +3601,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX942-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen +; GFX942-NEXT: buffer_load_dword v4, v9, s[4:7], 0 offen ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB12_1 ; GFX942-NEXT: ; %bb.2: @@ -3599,6 +3612,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX942-NEXT: ; =>This Loop Header: Depth=1 ; GFX942-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v4 ; GFX942-NEXT: v_lshrrev_b32_e32 v4, v8, v7 ; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX942-NEXT: v_min_f16_e32 v4, v4, v11 @@ -3627,7 +3641,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v7, v4 ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB12_3 @@ 
-3671,18 +3684,21 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, v6 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v8 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.h, v5.l, v5.l -; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v4.h, v4.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v4.h, v4.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v11, v5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v7, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v7, v8, v11, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v8 ; GFX11-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -3696,14 +3712,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[7:8], v10, s[4:7], 0 offen glc +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v10, s[4:7], 0 offen glc ; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, 
s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_4 ; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v7, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv ; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -3712,7 +3727,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_3 ; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v9, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v9, v6 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -3739,7 +3754,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen +; GFX11-FAKE16-NEXT: buffer_load_b32 v4, v8, s[4:7], 0 offen ; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB12_1 ; GFX11-FAKE16-NEXT: ; %bb.2: @@ -3750,17 +3765,19 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX11-FAKE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX11-FAKE16-NEXT: v_min_f16_e32 v4, v4, v10 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_min_f16_e32 v4, v4, v10 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v5 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6 ; GFX11-FAKE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 @@ -3783,7 +3800,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -3815,7 +3831,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX10-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB12_1 @@ -3826,9 +3842,10 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: 
s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX10-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX10-NEXT: v_min_f16_e32 v4, v4, v10 ; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 @@ -3854,7 +3871,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 @@ -3887,7 +3903,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen +; GFX90A-NEXT: buffer_load_dword v4, v9, s[8:11], 0 offen ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: @@ -3898,6 +3914,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v8, v7 ; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX90A-NEXT: v_min_f16_e32 v4, v4, v11 @@ -3924,7 +3941,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_3 @@ -3954,7 +3970,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: 
s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX908-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_1 ; GFX908-NEXT: ; %bb.2: @@ -3965,6 +3981,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX908-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX908-NEXT: v_min_f16_e32 v4, v4, v10 @@ -3992,7 +4009,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB12_3 @@ -4022,7 +4038,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX8-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_1 ; GFX8-NEXT: ; %bb.2: @@ -4033,6 +4049,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX8-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX8-NEXT: v_min_f16_e32 v4, v4, v10 @@ -4061,7 +4078,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX8-NEXT: s_waitcnt vmcnt(0) ; 
GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB12_3 @@ -4089,18 +4105,19 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX7-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB12_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v5 ; GFX7-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 ; GFX7-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_and_b32_e32 v5, v6, v9 @@ -4130,7 +4147,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB12_3 @@ -4159,18 +4175,20 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX6-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen ; GFX6-NEXT: s_xor_b64 exec, exec, 
s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB12_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v5 ; GFX6-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 ; GFX6-NEXT: ; Child Loop BB12_4 Depth 2 -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_and_b32_e32 v5, v6, v9 @@ -4200,7 +4218,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_cbranch_execnz .LBB12_3 @@ -4231,53 +4248,53 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, s4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen +; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v3, s[0:3], null offen ; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 ; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 -; GFX12-TRUE16-NEXT: 
v_mov_b16_e32 v3.l, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v1, v1, v3 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v1, v1, v2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v2.l ; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 ; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen 
th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -4289,54 +4306,54 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, s4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen +; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v3, s[0:3], null offen ; GFX12-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 ; 
GFX12-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v1 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v0, v0, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX12-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v1, v1, v0 +; GFX12-FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -4345,36 +4362,36 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_addk_i32 s16, 0x200 ; GFX942-NEXT: s_and_b32 s4, s16, -4 -; GFX942-NEXT: v_mov_b32_e32 v4, s4 -; GFX942-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen +; GFX942-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen ; GFX942-NEXT: s_and_b32 s4, s16, 3 ; GFX942-NEXT: s_lshl_b32 s6, s4, 3 ; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX942-NEXT: s_not_b32 s7, s4 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX942-NEXT: s_movk_i32 s8, 0x7fff ; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX942-NEXT: 
; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_lshrrev_b32_sdwa v2, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: v_min_f32_e32 v0, v0, v5 -; GFX942-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX942-NEXT: v_add3_u32 v2, v2, v0, s8 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v0 +; GFX942-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v3, v3, v2, s8 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1] -; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v2, s6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v4, v5, s7, v2 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[4:5] +; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 ; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX942-NEXT: s_cbranch_execnz .LBB13_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4388,146 +4405,146 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200 ; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v3, s[0:3], 0 offen ; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v2.l ; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff -; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v3 -; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB13_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX11-FAKE16-NEXT: 
v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3 ; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v3, s[0:3], 0 offen ; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v1 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-FAKE16-NEXT: v_min_f32_e32 v0, v0, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v1, v1, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB13_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 
-; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_and_b32 s4, s20, 3 ; GFX10-NEXT: s_lshl_b32 s4, s4, 3 ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX10-NEXT: s_not_b32 s6, s5 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_f32_e32 v0, v0, v5 -; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX10-NEXT: v_lshrrev_b32_sdwa v1, s4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_min_f32_e32 v1, v1, v0 +; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: v_mov_b32_e32 v2, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: 
buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB13_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -4535,34 +4552,34 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 ; GFX90A-NEXT: s_and_b32 s4, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX90A-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 ; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 ; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_min_f32_e32 v0, v0, v5 -; GFX90A-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX90A-NEXT: 
v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v0 +; GFX90A-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v3, v3, v2, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, s6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4575,40 +4592,40 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 ; GFX908-NEXT: s_and_b32 s4, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 -; GFX908-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX908-NEXT: v_mov_b32_e32 v3, s4 +; GFX908-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 ; GFX908-NEXT: s_lshl_b32 s6, s4, 3 ; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_min_f32_e32 v0, v0, v5 -; GFX908-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v2, v2, v0, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX908-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_lshrrev_b32_sdwa v1, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_min_f32_e32 v1, v1, v0 +; GFX908-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX908-NEXT: v_add3_u32 v2, v2, v1, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX908-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v1, s6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v4, v5, s7, v1 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -4616,42 +4633,42 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 ; GFX8-NEXT: s_and_b32 s4, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 ; GFX8-NEXT: s_lshl_b32 s6, s4, 3 ; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f32_e32 v3, v3, v5 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_lshrrev_b32_sdwa v1, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f32_e32 v1, v1, v0 +; GFX8-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v1 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; 
GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v4, s7, v5 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -4659,39 +4676,39 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 ; GFX7-NEXT: s_and_b32 s4, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v3, s4 +; GFX7-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX7-NEXT: s_and_b32 s4, s20, 3 ; GFX7-NEXT: s_lshl_b32 s6, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: 
v_min_f32_e32 v0, v0, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, s6, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_min_f32_e32 v1, v1, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v2, s7, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX7-NEXT: v_or_b32_e32 v4, v2, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: v_mov_b32_e32 v2, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -4700,40 +4717,40 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 ; GFX6-NEXT: s_and_b32 s4, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v4, s4 -; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX6-NEXT: v_mov_b32_e32 v3, s4 +; GFX6-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX6-NEXT: s_and_b32 s4, s20, 3 ; GFX6-NEXT: s_lshl_b32 s6, s4, 3 ; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 
s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: v_min_f32_e32 v0, v0, v5 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: v_mov_b32_e32 v3, v1 -; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX6-NEXT: v_lshrrev_b32_e32 v1, s6, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX6-NEXT: v_min_f32_e32 v1, v1, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_and_b32_e32 v2, s7, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX6-NEXT: v_or_b32_e32 v4, v2, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: v_mov_b32_e32 v2, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB13_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -4760,40 +4777,40 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; 
GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen +; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v3, s[0:3], null offen ; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 ; GFX12-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v1, v1, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, v4.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v1, v1, v2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v2.l +; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 ; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.h -; 
GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -4811,47 +4828,47 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; 
GFX12-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen +; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v3, s[0:3], null offen ; GFX12-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 ; GFX12-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v1 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v0, v0, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX12-FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v1, v1, v0 +; GFX12-FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-FAKE16-NEXT: 
v_lshrrev_b32_e32 v1, 16, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -4866,36 +4883,36 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_addk_i32 s16, 0x200 ; GFX942-NEXT: s_and_b32 s4, s16, -4 -; GFX942-NEXT: v_mov_b32_e32 v2, s4 -; GFX942-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen +; GFX942-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen ; GFX942-NEXT: s_and_b32 s4, s16, 3 ; GFX942-NEXT: s_lshl_b32 s6, s4, 3 ; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX942-NEXT: s_not_b32 s7, s4 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX942-NEXT: s_movk_i32 s8, 0x7fff ; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_lshrrev_b32_sdwa v2, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX942-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX942-NEXT: v_add3_u32 v4, v4, v0, s8 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v0 +; GFX942-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v3, v3, v2, s8 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] -; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v2, s6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v4, v5, s7, v2 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[4:5] +; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 ; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX942-NEXT: s_cbranch_execnz .LBB14_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4913,40 +4930,40 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; 
GFX11-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v3, s[0:3], 0 offen ; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v1, v1, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v4.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v2.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.h +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v1, s4, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v4 -; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -4959,45 +4976,45 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3 ; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v3, s[0:3], 0 offen ; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 ; 
GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v1 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-FAKE16-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v1, v1, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 
v1, s4, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -5010,36 +5027,36 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_and_b32 s4, s20, 3 ; GFX10-NEXT: s_lshl_b32 s4, s4, 3 ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX10-NEXT: s_not_b32 s6, s5 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo -; GFX10-NEXT: 
v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshrrev_b32_sdwa v1, s4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_min_f32_e32 v1, v1, v0 +; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v4, v5, s6, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: v_mov_b32_e32 v2, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB14_1 @@ -5052,34 +5069,34 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 ; GFX90A-NEXT: s_and_b32 s4, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX90A-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 ; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 ; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX90A-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v0 +; GFX90A-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v3, v3, v2, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, s6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5091,35 +5108,35 @@ define void 
@buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 ; GFX908-NEXT: s_and_b32 s4, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v2, s4 -; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX908-NEXT: v_mov_b32_e32 v3, s4 +; GFX908-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 ; GFX908-NEXT: s_lshl_b32 s6, s4, 3 ; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX908-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v4, v4, v0, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX908-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX908-NEXT: v_lshrrev_b32_sdwa v1, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_min_f32_e32 v1, v1, v0 +; GFX908-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX908-NEXT: v_add3_u32 v2, v2, v1, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX908-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v1, s6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; 
GFX908-NEXT: v_and_or_b32 v4, v5, s7, v1 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5131,37 +5148,37 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 ; GFX8-NEXT: s_and_b32 s4, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 ; GFX8-NEXT: s_lshl_b32 s6, s4, 3 ; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX8-NEXT: s_not_b32 s7, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f32_e32 v5, v5, v3 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: 
v_or_b32_e32 v0, v4, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX8-NEXT: v_lshrrev_b32_sdwa v1, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f32_e32 v1, v1, v0 +; GFX8-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v4, s7, v5 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5173,34 +5190,34 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 ; GFX7-NEXT: s_and_b32 s4, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v3, s4 +; GFX7-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX7-NEXT: s_and_b32 s4, s20, 3 ; GFX7-NEXT: s_lshl_b32 s6, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: .LBB14_1: ; 
%atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX7-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX7-NEXT: v_lshrrev_b32_e32 v1, s6, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_min_f32_e32 v1, v1, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v2, s7, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX7-NEXT: v_or_b32_e32 v4, v2, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: v_mov_b32_e32 v2, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB14_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5212,35 +5229,35 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 ; GFX6-NEXT: s_and_b32 s4, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX6-NEXT: v_mov_b32_e32 v3, s4 +; GFX6-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GFX6-NEXT: s_and_b32 s4, s20, 3 ; GFX6-NEXT: s_lshl_b32 s6, s4, 3 ; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 
0 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX6-NEXT: v_mov_b32_e32 v5, v1 -; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v1, s6, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX6-NEXT: v_min_f32_e32 v1, v1, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_and_b32_e32 v2, s7, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX6-NEXT: v_or_b32_e32 v4, v2, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: v_mov_b32_e32 v2, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB14_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5264,11 +5281,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v6 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, -4, v6 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; 
GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v11, v7 +; GFX12-TRUE16-NEXT: v_not_b32_e32 v10, v7 ; GFX12-TRUE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 @@ -5282,7 +5299,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen +; GFX12-TRUE16-NEXT: buffer_load_b32 v6, v9, s[4:7], null offen ; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-TRUE16-NEXT: ; %bb.2: @@ -5292,28 +5309,31 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX12-TRUE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.h, v5.l +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v8, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v5.l ; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v8 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v6, v6, v8 -; GFX12-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v6, v6, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: 
v_bfe_u32 v11, v6, 16, 1 ; GFX12-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v6 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v11, v11, v6, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v12, vcc_lo -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v11, v12, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v8.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v7.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX12-TRUE16-NEXT: v_and_or_b32 v7, v8, v10, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v8 ; GFX12-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -5328,14 +5348,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: 
s_cbranch_execnz .LBB15_4 ; GFX12-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v8 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -5343,7 +5362,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_3 ; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v6 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -5376,7 +5395,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen +; GFX12-FAKE16-NEXT: buffer_load_b32 v4, v8, s[4:7], null offen ; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-FAKE16-NEXT: ; %bb.2: @@ -5387,25 +5406,26 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX12-FAKE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s2, exec_lo ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; 
GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v4, v4, v10 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX12-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v4 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v6 ; GFX12-FAKE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 @@ -5429,7 +5449,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -5462,7 +5481,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX942-NEXT: 
buffer_load_dword v7, v9, s[4:7], 0 offen +; GFX942-NEXT: buffer_load_dword v4, v9, s[4:7], 0 offen ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB15_1 ; GFX942-NEXT: ; %bb.2: @@ -5474,6 +5493,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX942-NEXT: ; =>This Loop Header: Depth=1 ; GFX942-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, v4 ; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_mov_b64 s[8:9], exec ; GFX942-NEXT: v_min_f32_e32 v4, v4, v11 @@ -5507,7 +5527,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v7, v4 ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB15_3 @@ -5524,11 +5543,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, -4, v6 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v11, v7 +; GFX11-TRUE16-NEXT: v_not_b32_e32 v10, v7 ; GFX11-TRUE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 @@ -5540,7 +5559,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX11-TRUE16-NEXT: 
s_and_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: buffer_load_b32 v7, v10, s[4:7], 0 offen +; GFX11-TRUE16-NEXT: buffer_load_b32 v6, v9, s[4:7], 0 offen ; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_1 ; GFX11-TRUE16-NEXT: ; %bb.2: @@ -5551,27 +5570,30 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v5.l +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v5.l ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v8 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f32_e32 v6, v6, v8 -; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v6, v6, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v6, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v6 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v12, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v6, 0x7fff +; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v11, v12, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v7.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX11-TRUE16-NEXT: v_and_or_b32 v7, v8, v10, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v8 ; GFX11-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -5585,14 +5607,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], 0 offen glc +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc ; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_4 ; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v8 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv ; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -5602,7 +5623,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX11-TRUE16-NEXT: ; %bb.6: ; 
%atomicrmw.end ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v6 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -5629,7 +5650,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen +; GFX11-FAKE16-NEXT: buffer_load_b32 v4, v8, s[4:7], 0 offen ; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB15_1 ; GFX11-FAKE16-NEXT: ; %bb.2: @@ -5641,24 +5662,25 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX11-FAKE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_min_f32_e32 v4, v4, v10 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v4 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; 
GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v5 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6 ; GFX11-FAKE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 @@ -5681,7 +5703,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -5714,7 +5735,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX10-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB15_1 @@ -5725,9 +5746,10 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; 
GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_min_f32_e32 v4, v4, v10 ; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v4 @@ -5757,7 +5779,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 @@ -5790,7 +5811,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen +; GFX90A-NEXT: buffer_load_dword v4, v9, s[8:11], 0 offen ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 ; GFX90A-NEXT: ; %bb.2: @@ -5802,6 +5823,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX90A-NEXT: v_min_f32_e32 v4, v4, v11 ; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 @@ -5832,7 +5854,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB15_3 @@ -5862,7 +5883,7 @@ 
define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX908-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB15_1 ; GFX908-NEXT: ; %bb.2: @@ -5874,6 +5895,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX908-NEXT: v_min_f32_e32 v4, v4, v10 ; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 @@ -5905,7 +5927,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB15_3 @@ -5935,7 +5956,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX8-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB15_1 ; GFX8-NEXT: ; %bb.2: @@ -5946,6 +5967,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_min_f32_e32 v4, v4, v10 ; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1 @@ -5979,7 +6001,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB15_3 @@ -6007,18 +6028,19 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX7-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB15_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX7-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 ; GFX7-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -6049,7 +6071,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB15_3 @@ -6078,18 +6099,20 @@ define bfloat 
@buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX6-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB15_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v5 +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 -; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX6-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 ; GFX6-NEXT: ; Child Loop BB15_4 Depth 2 -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -6120,7 +6143,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_cbranch_execnz .LBB15_3 @@ -6470,7 +6492,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, s16 ; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 -; GFX12-NEXT: v_pk_max_num_f16 v2, v0, v0 +; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 @@ -6478,16 +6500,17 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v0, v1, v1 +; GFX12-NEXT: v_mov_b32_e32 v5, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v0, v0, v2 -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_pk_max_num_f16 v1, v5, v5 +; GFX12-NEXT: v_pk_min_num_f16 v4, v1, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -6501,25 +6524,25 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v1, s16 -; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:1024 ; GFX942-NEXT: s_add_i32 s6, s16, 0x400 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, s6 +; GFX942-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX942-NEXT: v_mov_b32_e32 v1, s6 ; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_pk_max_f16 v2, v5, v5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX942-NEXT: v_pk_min_f16 
v4, v2, v0 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] -; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[4:5] +; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 ; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX942-NEXT: s_cbranch_execnz .LBB17_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6531,24 +6554,25 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s16 ; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: v_pk_max_f16 v2, v0, v0 +; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 ; GFX11-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX11-NEXT: v_mov_b32_e32 v5, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc +; GFX11-NEXT: v_pk_max_f16 v1, v5, v5 +; GFX11-NEXT: v_pk_min_f16 v4, v1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX11-NEXT: v_mov_b32_e32 
v1, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -6562,24 +6586,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 ; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_pk_max_f16 v2, v0, v0 +; GFX10-NEXT: v_pk_max_f16 v0, v0, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, v1, v1 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_min_f16 v0, v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_max_f16 v1, v5, v5 +; GFX10-NEXT: v_pk_min_f16 v4, v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: v_mov_b32_e32 v2, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB17_1 @@ -6591,23 +6615,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; 
GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 -; GFX90A-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_pk_max_f16 v2, v5, v5 +; GFX90A-NEXT: v_pk_min_f16 v4, v2, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6621,21 +6645,21 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v0, v0 +; GFX908-NEXT: v_pk_max_f16 v0, v0, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v0, v1, v1 -; GFX908-NEXT: v_pk_min_f16 v0, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_pk_max_f16 v1, v5, v5 +; GFX908-NEXT: v_pk_min_f16 v4, v1, v0 +; 
GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6649,25 +6673,25 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v2, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX8-NEXT: v_max_f16_sdwa v3, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v1, v1 -; GFX8-NEXT: v_min_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX8-NEXT: v_mov_b32_e32 v6, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX8-NEXT: v_max_f16_sdwa v1, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v2, v6, v6 +; GFX8-NEXT: v_min_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v2, v2, v0 +; GFX8-NEXT: v_or_b32_e32 v5, v2, 
v1 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v2, v6 +; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7933,43 +7957,45 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, s16 ; GFX12-TRUE16-NEXT: s_add_co_i32 s4, s16, 0x400 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 ; GFX12-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX12-TRUE16-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v1 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_min_num_f32 v5, v5, v2 :: v_dual_min_num_f32 v0, v0, v3 -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX12-TRUE16-NEXT: v_dual_min_num_f32 v2, v2, v3 :: v_dual_lshlrev_b32 v1, 16, v6 +; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v1, v1, v0 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v1, v2 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v4, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 
0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v6 ; GFX12-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -7986,42 +8012,44 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, s16 ; GFX12-FAKE16-NEXT: s_add_co_i32 s4, s16, 0x400 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 ; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 ; GFX12-FAKE16-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v1 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_min_num_f32 v5, v5, v3 :: v_dual_min_num_f32 v0, v0, v2 -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX12-FAKE16-NEXT: v_dual_min_num_f32 v2, v2, v0 :: v_dual_lshlrev_b32 v1, 16, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v1, v1, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 ; 
GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v1, v1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v1, v5, v8, s4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: v_perm_b32 v5, v2, v1, 0x7060302 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v4, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v5 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v6 ; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; 
GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -8035,40 +8063,40 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v1, s16 -; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:1024 ; GFX942-NEXT: s_add_i32 s4, s16, 0x400 ; GFX942-NEXT: s_mov_b64 s[6:7], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX942-NEXT: s_movk_i32 s8, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX942-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX942-NEXT: s_mov_b32 s9, 0x7060302 ; GFX942-NEXT: v_mov_b32_e32 v4, s4 ; GFX942-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX942-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX942-NEXT: v_min_f32_e32 v5, v5, v3 -; GFX942-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX942-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v1 +; GFX942-NEXT: v_min_f32_e32 v3, v3, v0 +; GFX942-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v5, v5, v2, s8 +; GFX942-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; 
GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX942-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1] -; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX942-NEXT: v_perm_b32 v6, v3, v2, s9 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[6:7] +; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v7 ; GFX942-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v1, v6 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX942-NEXT: s_cbranch_execnz .LBB20_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8078,10 +8106,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s16 ; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 @@ -8089,34 +8118,35 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-TRUE16-NEXT: .LBB20_1: ; 
%atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v1 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_min_f32 v5, v5, v2 :: v_dual_min_f32 v0, v0, v3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_dual_min_f32 v2, v2, v3 :: v_dual_lshlrev_b32 v1, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v1, v1, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v1, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v4, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v6 ; GFX11-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -8129,10 +8159,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s16 ; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 @@ -8140,32 +8171,33 @@ define void 
@buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-FAKE16-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v1 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_min_f32 v5, v5, v3 :: v_dual_min_f32 v0, v0, v2 -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_dual_min_f32 v2, v2, v0 :: v_dual_lshlrev_b32 v1, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v1, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; 
GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, v5, v8, s4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v2, v1, 0x7060302 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v4, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v5 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v6 ; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -8180,38 +8212,38 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 ; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, s4 ; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_min_f32_e32 v5, v5, v3 -; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; 
GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v6, v1 -; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX10-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX10-NEXT: v_min_f32_e32 v2, v2, v0 +; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v1, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, v8, s4 +; GFX10-NEXT: v_perm_b32 v5, v2, v1, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_mov_b32_e32 v2, v6 +; GFX10-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB20_1 @@ -8223,39 +8255,39 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; 
GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 ; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX90A-NEXT: v_min_f32_e32 v5, v5, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v1 +; GFX90A-NEXT: v_min_f32_e32 v3, v3, v0 +; GFX90A-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v5, v5, v2, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; 
GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v6, v3, v2, s9 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8269,37 +8301,37 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s4, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: v_mov_b32_e32 v4, s4 ; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX908-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX908-NEXT: v_min_f32_e32 v5, v5, v3 -; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 ; GFX908-NEXT: v_mov_b32_e32 v6, v1 -; GFX908-NEXT: 
v_mov_b32_e32 v5, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX908-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v0 +; GFX908-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v2, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX908-NEXT: v_add3_u32 v5, v5, v1, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v2, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX908-NEXT: v_cndmask_b32_e64 v1, v5, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v5, v2, v1, s9 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: v_mov_b32_e32 v2, v6 +; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB20_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8313,38 +8345,38 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX8-NEXT: 
v_min_f32_e32 v5, v5, v3 -; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 ; GFX8-NEXT: v_mov_b32_e32 v6, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX8-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_min_f32_e32 v2, v2, v0 +; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v2 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v5, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_alignbit_b32 v5, v2, v1, 16 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v2, v6 +; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: 
s_cbranch_execnz .LBB20_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll index e6c38d29be949..26f9949085163 100644 --- a/llvm/test/CodeGen/AMDGPU/div_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll @@ -220,19 +220,22 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0: ; %bb.0: ; %_udiv-special-cases ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v20, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v21, v0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(2) +; GFX9-O0-NEXT: v_mov_b32_e32 v21, v2 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec @@ -243,31 +246,32 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v3 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; 
GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v9 ; GFX9-O0-NEXT: v_mov_b32_e32 v19, v10 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 -; GFX9-O0-NEXT: ; implicit-def: $vgpr30 : SGPR spill to VGPR lane -; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 0 -; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 1 +; GFX9-O0-NEXT: ; implicit-def: $vgpr29 : SGPR spill to VGPR lane +; GFX9-O0-NEXT: v_writelane_b32 v29, s6, 0 +; GFX9-O0-NEXT: v_writelane_b32 v29, s7, 1 ; GFX9-O0-NEXT: s_mov_b32 s10, s6 -; GFX9-O0-NEXT: v_writelane_b32 v30, s10, 2 +; GFX9-O0-NEXT: v_writelane_b32 v29, s10, 2 ; GFX9-O0-NEXT: s_mov_b32 s11, s7 -; GFX9-O0-NEXT: v_writelane_b32 v30, s11, 3 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v5, vcc, s10, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, s11 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v3, vcc, v0, v2, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v0, v13, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v0, s11 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v19, vcc +; GFX9-O0-NEXT: v_writelane_b32 v29, s11, 3 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v5, vcc, s10, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s11 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v3, vcc, v2, v1, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v2, v13, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s11 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v19, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; 
GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec @@ -275,25 +279,25 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 ; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX9-O0-NEXT: v_cmp_lt_i64_e64 s[4:5], v[9:10], s[4:5] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[4:5] ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v5 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v19, v0, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v13, v0, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v19, v2, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v13, v2, s[4:5] ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr7_vgpr8 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v12, v20 @@ -424,18 +428,18 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v6 ; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] ; 
GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[4:5], s[8:9] -; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v0 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v2 ; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s13 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v1 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v3 ; GFX9-O0-NEXT: v_min_u32_e64 v5, v4, v5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr16 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s12 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v2 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v0 ; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s13 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v3 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v1 ; GFX9-O0-NEXT: v_min_u32_e64 v11, v4, v10 ; GFX9-O0-NEXT: ; implicit-def: $sgpr13 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s12 @@ -515,35 +519,35 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[12:13] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[12:13] ; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[12:13] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[12:13] ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 -; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 ; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[12:13] -; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, 
s[12:13] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9] ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 -; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 s[4:5], exec -; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 4 -; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 5 +; GFX9-O0-NEXT: v_writelane_b32 v29, s4, 4 +; GFX9-O0-NEXT: v_writelane_b32 v29, s5, 5 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] ; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] @@ -551,67 +555,64 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_branch .LBB0_8 ; GFX9-O0-NEXT: .LBB0_1: ; %Flow ; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 6 -; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 7 +; GFX9-O0-NEXT: 
v_readlane_b32 s4, v29, 6 +; GFX9-O0-NEXT: v_readlane_b32 s5, v29, 7 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: ; %bb.2: ; %Flow -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, 
off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_5 ; GFX9-O0-NEXT: .LBB0_3: ; %Flow2 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 4 -; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 5 +; GFX9-O0-NEXT: v_readlane_b32 s4, v29, 4 +; GFX9-O0-NEXT: v_readlane_b32 s5, v29, 5 ; GFX9-O0-NEXT: 
s_or_b64 exec, exec, s[4:5] -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_9 ; GFX9-O0-NEXT: .LBB0_4: ; %udiv-loop-exit -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; 
GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 1 ; GFX9-O0-NEXT: s_waitcnt vmcnt(2) ; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[0:1] @@ -636,408 +637,408 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: 
s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_3 ; GFX9-O0-NEXT: .LBB0_5: ; %Flow1 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 8 -; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 9 +; GFX9-O0-NEXT: v_readlane_b32 s4, v29, 8 +; GFX9-O0-NEXT: v_readlane_b32 s5, v29, 9 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:112 ; 4-byte Folded 
Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_4 ; GFX9-O0-NEXT: .LBB0_6: ; %udiv-do-while ; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 
; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s6, v30, 10 -; GFX9-O0-NEXT: v_readlane_b32 s7, v30, 11 -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_readlane_b32 s6, v29, 10 +; GFX9-O0-NEXT: v_readlane_b32 s7, v29, 11 +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; 
GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; 
GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 63 -; GFX9-O0-NEXT: s_waitcnt vmcnt(16) -; GFX9-O0-NEXT: v_lshrrev_b64 v[28:29], s4, v[2:3] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v29 +; GFX9-O0-NEXT: s_waitcnt vmcnt(10) +; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], s4, v[2:3] +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v7 ; GFX9-O0-NEXT: s_mov_b32 s5, 1 -; GFX9-O0-NEXT: v_lshlrev_b64 v[22:23], s5, v[22:23] -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v23 -; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v28 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v22 -; GFX9-O0-NEXT: v_or_b32_e64 v22, v5, v10 -; GFX9-O0-NEXT: ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v23, v4 -; GFX9-O0-NEXT: v_lshlrev_b64 v[28:29], s5, v[2:3] -; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], s4, v[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v29 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-O0-NEXT: s_waitcnt vmcnt(8) +; GFX9-O0-NEXT: v_lshlrev_b64 v[26:27], s5, v[26:27] +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v27 +; GFX9-O0-NEXT: v_or_b32_e64 v14, v14, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v26 +; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 +; GFX9-O0-NEXT: v_lshlrev_b64 v[26:27], s5, v[2:3] +; GFX9-O0-NEXT: v_lshrrev_b64 v[14:15], s4, v[10:11] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v27 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v15 ; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v28 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec -; 
GFX9-O0-NEXT: v_or_b32_e64 v4, v3, v4 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v26 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_or_b32_e64 v14, v3, v14 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v2 ; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s5, v[0:1] -; GFX9-O0-NEXT: v_lshlrev_b64 v[28:29], s5, v[6:7] +; GFX9-O0-NEXT: v_lshlrev_b64 v[26:27], s5, v[10:11] ; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v29 -; GFX9-O0-NEXT: s_waitcnt vmcnt(10) +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v27 -; GFX9-O0-NEXT: v_or3_b32 v6, v6, v7, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v28, v25 +; GFX9-O0-NEXT: v_or3_b32 v10, v10, v11, v28 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v28 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v26 -; GFX9-O0-NEXT: v_or3_b32 v0, v0, v1, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v26 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v24 +; GFX9-O0-NEXT: v_or3_b32 v0, v0, v1, v11 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-O0-NEXT: s_waitcnt vmcnt(8) -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v25 -; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v17 +; GFX9-O0-NEXT: v_or_b32_e64 v10, v10, v11 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v24 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v16 ; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 -; 
GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v22 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7 ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v14 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v15 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v13, vcc, v13, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_subb_co_u32_e32 v12, vcc, v12, v10, vcc -; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v11, v4, vcc -; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v5, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v22 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v7, vcc, v7, v10 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v16, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v14, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v6, vcc, v5, v11, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 -; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v7 -; GFX9-O0-NEXT: v_ashrrev_i64 v[13:14], s4, v[11:12] -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 +; GFX9-O0-NEXT: v_ashrrev_i64 v[5:6], s4, v[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v6 ; GFX9-O0-NEXT: s_mov_b64 s[4:5], 1 ; GFX9-O0-NEXT: s_mov_b32 s8, s5 -; GFX9-O0-NEXT: v_and_b32_e64 v12, v7, s8 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v13 +; GFX9-O0-NEXT: v_and_b32_e64 v4, v15, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v5 ; GFX9-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 -; GFX9-O0-NEXT: v_and_b32_e64 v14, v11, s4 -; GFX9-O0-NEXT: ; kill: def $vgpr14 killed 
$vgpr14 def $vgpr14_vgpr15 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, 0 +; GFX9-O0-NEXT: v_and_b32_e64 v6, v17, s4 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-O0-NEXT: v_mov_b32_e32 v22, v21 -; GFX9-O0-NEXT: v_and_b32_e64 v22, v7, v22 -; GFX9-O0-NEXT: v_and_b32_e64 v20, v11, v20 +; GFX9-O0-NEXT: v_and_b32_e64 v22, v15, v22 +; GFX9-O0-NEXT: v_and_b32_e64 v20, v17, v20 ; GFX9-O0-NEXT: ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v21, v22 ; GFX9-O0-NEXT: v_mov_b32_e32 v22, v19 -; GFX9-O0-NEXT: v_and_b32_e64 v7, v7, v22 -; GFX9-O0-NEXT: v_and_b32_e64 v22, v11, v18 -; GFX9-O0-NEXT: ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v23, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v22 -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v23 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v20 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v21 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v19 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v10, vcc, v10, v18, vcc -; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v11, vcc -; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v5, v7, vcc +; GFX9-O0-NEXT: v_and_b32_e64 v15, v15, v22 +; GFX9-O0-NEXT: v_and_b32_e64 v17, v17, v18 +; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v17 +; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 killed $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v20 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v21 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v10, vcc, v10, v19 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v16, vcc, v16, v18, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v14, vcc, v14, v17, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, 
v11, v15, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v11 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10 +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v16 +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v8 ; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 killed $vgpr8_vgpr9 killed $exec ; GFX9-O0-NEXT: s_mov_b64 s[8:9], -1 ; GFX9-O0-NEXT: s_mov_b32 s5, s8 ; GFX9-O0-NEXT: s_mov_b32 s4, s9 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v16 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, s5 -; GFX9-O0-NEXT: v_add_co_u32_e32 v19, vcc, v11, v16 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, s4 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v11, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v11, s5 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v16, vcc, v10, v11, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v10, s4 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v10, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s5 +; GFX9-O0-NEXT: v_add_co_u32_e32 v11, vcc, v11, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 
v9, vcc, v9, v12, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s5 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v12, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v10, vcc, v10, v12, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v9 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v9 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v16 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v19 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v20 -; GFX9-O0-NEXT: v_mov_b32_e32 v21, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v20 -; GFX9-O0-NEXT: v_or_b32_e64 v18, v18, v21 -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v16 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v19 -; GFX9-O0-NEXT: v_or_b32_e64 v16, v16, v17 -; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v18 -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[16:17], v[12:13] +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v8 +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 
offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v12 +; GFX9-O0-NEXT: v_or_b32_e64 v10, v10, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v11 +; GFX9-O0-NEXT: v_or_b32_e64 v8, v8, v9 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v10 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[8:9], v[4:5] ; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v2 -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v0 -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v15 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v14 -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 
offset:140 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v12 -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 6 -; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 7 +; GFX9-O0-NEXT: v_writelane_b32 v29, s6, 6 +; GFX9-O0-NEXT: v_writelane_b32 v29, s7, 7 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 10 -; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 11 +; GFX9-O0-NEXT: v_writelane_b32 v29, s6, 10 +; GFX9-O0-NEXT: v_writelane_b32 v29, s7, 11 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: 
buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: s_cbranch_execnz .LBB0_6 ; GFX9-O0-NEXT: s_branch .LBB0_1 ; GFX9-O0-NEXT: .LBB0_7: ; %udiv-preheader ; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 
4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 
offset:308 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(9) -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], v4, v[18:19] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-O0-NEXT: v_lshrrev_b64 v[14:15], v8, v[18:19] +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v15 ; GFX9-O0-NEXT: s_mov_b32 s4, 64 -; GFX9-O0-NEXT: v_sub_u32_e64 v20, s4, v4 -; GFX9-O0-NEXT: v_lshlrev_b64 v[20:21], v20, v[14:15] +; GFX9-O0-NEXT: v_sub_u32_e64 v20, s4, v8 +; GFX9-O0-NEXT: v_lshlrev_b64 v[20:21], v20, v[16:17] ; GFX9-O0-NEXT: v_mov_b32_e32 v22, v21 -; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v22 -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v20 -; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v22, v7 -; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[6:7], v4, s4 -; GFX9-O0-NEXT: v_sub_u32_e64 v5, v4, s4 -; GFX9-O0-NEXT: v_lshrrev_b64 v[20:21], v5, v[14:15] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v21 -; 
GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v22, s[6:7] +; GFX9-O0-NEXT: v_or_b32_e64 v11, v11, v22 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v20 +; GFX9-O0-NEXT: v_or_b32_e64 v14, v14, v15 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v15 +; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[6:7], v8, s4 +; GFX9-O0-NEXT: v_sub_u32_e64 v11, v8, s4 +; GFX9-O0-NEXT: v_lshrrev_b64 v[20:21], v11, v[16:17] +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v21 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v11, v11, v22, s[6:7] ; GFX9-O0-NEXT: s_mov_b32 s4, 0 -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, s4 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, s4 ; GFX9-O0-NEXT: v_mov_b32_e32 v22, v19 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v22, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v20 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v18 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v11, v11, v22, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v20 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v18 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[4:5] ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], v4, v[14:15] -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v11 +; GFX9-O0-NEXT: v_lshrrev_b64 v[16:17], v8, v[16:17] +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v17 ; GFX9-O0-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-O0-NEXT: s_mov_b32 s8, s5 -; 
GFX9-O0-NEXT: v_mov_b32_e32 v14, s8 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, s8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v8, v11, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 killed $vgpr16_vgpr17 killed $exec ; GFX9-O0-NEXT: s_mov_b32 s8, s4 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v16, v11, v16, s[6:7] ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v14 -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v13 +; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v8 +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v10 ; GFX9-O0-NEXT: s_mov_b64 s[8:9], -1 ; GFX9-O0-NEXT: s_mov_b32 s7, s8 ; GFX9-O0-NEXT: s_mov_b32 s6, s9 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v16 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, s7 -; GFX9-O0-NEXT: v_add_co_u32_e32 v16, vcc, v15, v16 -; GFX9-O0-NEXT: v_mov_b32_e32 v15, s6 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v12, vcc, v12, v15, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v15, s7 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v18, vcc, v14, v15, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v14, s6 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v13, vcc, v13, v14, vcc +; 
GFX9-O0-NEXT: v_mov_b32_e32 v10, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s7 +; GFX9-O0-NEXT: v_add_co_u32_e32 v8, vcc, v8, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s6 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v12, vcc, v11, v12, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s7 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v10, vcc, v10, v11, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s6 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v11, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v13 +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v12 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v12 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, s5 ; GFX9-O0-NEXT: v_mov_b32_e32 v12, s4 -; GFX9-O0-NEXT: v_mov_b32_e32 v15, s5 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, s4 -; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 10 -; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 11 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 
offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v13, s5 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s4 +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_writelane_b32 v29, s4, 10 +; GFX9-O0-NEXT: v_writelane_b32 v29, s5, 11 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:236 ; 4-byte 
Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_6 ; GFX9-O0-NEXT: .LBB0_8: ; %udiv-bb1 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, 
s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload @@ -1050,129 +1051,129 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 ; GFX9-O0-NEXT: s_mov_b32 s8, s6 ; GFX9-O0-NEXT: s_mov_b32 s9, s7 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-O0-NEXT: v_add_co_u32_e32 v8, vcc, v3, v4 +; GFX9-O0-NEXT: v_add_co_u32_e32 v7, vcc, v3, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v4, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s9 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc +; GFX9-O0-NEXT: v_addc_co_u32_e32 v4, vcc, v2, v4, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v2, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; 
GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v8 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b32 s4, 0x7f ; GFX9-O0-NEXT: v_sub_u32_e64 v2, s4, v3 -; GFX9-O0-NEXT: v_lshlrev_b64 v[4:5], v2, v[10:11] -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5 +; GFX9-O0-NEXT: v_lshlrev_b64 v[0:1], v2, v[11:12] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 ; GFX9-O0-NEXT: s_mov_b32 s4, 64 ; GFX9-O0-NEXT: v_sub_u32_e64 v13, s4, v2 -; GFX9-O0-NEXT: v_lshrrev_b64 v[13:14], v13, v[6:7] +; GFX9-O0-NEXT: v_lshrrev_b64 v[13:14], v13, v[9:10] ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v14 -; GFX9-O0-NEXT: v_or_b32_e64 v12, v12, v15 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v13 -; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 -; 
GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v5 +; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v15 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v13 +; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 ; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v2, s4 ; GFX9-O0-NEXT: s_mov_b32 s10, 63 ; GFX9-O0-NEXT: v_sub_u32_e64 v3, s10, v3 -; GFX9-O0-NEXT: v_lshlrev_b64 v[12:13], v3, v[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v14, s[4:5] +; GFX9-O0-NEXT: v_lshlrev_b64 v[13:14], v3, v[9:10] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v14 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[4:5] ; GFX9-O0-NEXT: s_mov_b32 s10, 0 ; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[10:11], v2, s10 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v14, s[10:11] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[10:11] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v12 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[10:11] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v13 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[10:11] ; GFX9-O0-NEXT: ; implicit-def: $sgpr10 ; GFX9-O0-NEXT: ; implicit-def: $sgpr10 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-O0-NEXT: v_lshlrev_b64 v[6:7], v2, v[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9 +; 
GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], v2, v[9:10] +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s9 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[4:5] -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, s8 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v3, v6, s[4:5] ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2 -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9 -; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v8 -; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[0:1], 
s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v7 +; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[4:5], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, 
off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec ; GFX9-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] ; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] -; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 8 -; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 9 +; GFX9-O0-NEXT: v_writelane_b32 v29, s6, 8 +; GFX9-O0-NEXT: v_writelane_b32 v29, s7, 9 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_cbranch_execz .LBB0_5 ; GFX9-O0-NEXT: s_branch .LBB0_7 ; GFX9-O0-NEXT: .LBB0_9: ; %udiv-end -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, 
s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) @@ -1216,7 +1217,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_lshrrev_b64 v[3:4], s4, v[3:4] ; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr3_vgpr4 killed $exec ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] @@ -1687,29 +1688,26 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_readlane_b32 s5, v31, 3 ; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-G-O0-NEXT: ; %bb.2: ; %Flow -; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword 
v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) +; GFX9-G-O0-NEXT: s_nop 0 ; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_branch .LBB0_5 ; GFX9-G-O0-NEXT: .LBB0_3: ; %Flow2 @@ -1795,14 +1793,14 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_readlane_b32 s4, v31, 4 ; GFX9-G-O0-NEXT: v_readlane_b32 s5, v31, 5 ; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) @@ -2245,20 +2243,20 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s5 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s4 ; GFX9-G-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[4:5], v[6:7] -; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_nop 0 -; GFX9-G-O0-NEXT: buffer_store_dword v1, off, 
s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s10 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_nop 0 -; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], exec ; GFX9-G-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] ; GFX9-G-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] @@ -2493,7 +2491,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0: ; %bb.0: ; %_udiv-special-cases ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 
exec, s[4:5] ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v6 ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill @@ -2550,16 +2548,16 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 -; GFX9-O0-NEXT: ; implicit-def: $vgpr30 : SGPR spill to VGPR lane -; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 0 -; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 1 +; GFX9-O0-NEXT: ; implicit-def: $vgpr29 : SGPR spill to VGPR lane +; GFX9-O0-NEXT: v_writelane_b32 v29, s6, 0 +; GFX9-O0-NEXT: v_writelane_b32 v29, s7, 1 ; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[0:1], s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 -; GFX9-O0-NEXT: v_or_b32_e64 v14, v3, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8 -; GFX9-O0-NEXT: v_or_b32_e64 v8, v2, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v9 +; GFX9-O0-NEXT: v_or_b32_e64 v14, v1, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v8 +; GFX9-O0-NEXT: v_or_b32_e64 v8, v0, v2 ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v14 ; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], s[6:7] @@ -2604,18 +2602,18 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v0 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v2 ; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s9 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v1 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v3 ; GFX9-O0-NEXT: v_min_u32_e64 v5, v4, v5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed 
$exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v2 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v0 ; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s9 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v3 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v1 ; GFX9-O0-NEXT: v_min_u32_e64 v14, v4, v10 ; GFX9-O0-NEXT: ; implicit-def: $sgpr9 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 @@ -2699,35 +2697,35 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[12:13] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[12:13] ; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[12:13] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[12:13] ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 -; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 ; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[12:13] -; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[12:13] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9] ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 -; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: 
v_mov_b32_e32 v1, v4 ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 s[4:5], exec -; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 2 -; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 3 +; GFX9-O0-NEXT: v_writelane_b32 v29, s4, 2 +; GFX9-O0-NEXT: v_writelane_b32 v29, s5, 3 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] @@ -2735,50 +2733,47 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_branch .LBB1_8 ; GFX9-O0-NEXT: .LBB1_1: ; %Flow ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 4 -; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 5 +; GFX9-O0-NEXT: v_readlane_b32 s4, v29, 4 +; GFX9-O0-NEXT: v_readlane_b32 s5, v29, 5 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: ; %bb.2: ; %Flow -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword 
v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(7) +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(7) +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(7) +; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(7) +; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded 
Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(7) +; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB1_5 ; GFX9-O0-NEXT: .LBB1_3: ; %Flow2 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 2 -; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 3 +; GFX9-O0-NEXT: v_readlane_b32 s4, v29, 2 +; GFX9-O0-NEXT: v_readlane_b32 s5, v29, 3 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) @@ -2820,29 +2815,29 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], 
s32 offset:12 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB1_3 ; GFX9-O0-NEXT: .LBB1_5: ; %Flow1 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 6 -; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 7 +; GFX9-O0-NEXT: v_readlane_b32 s4, v29, 6 +; GFX9-O0-NEXT: v_readlane_b32 s5, v29, 7 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-O0-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) @@ -2860,214 +2855,214 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: .LBB1_6: ; %udiv-do-while ; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s6, v30, 8 -; GFX9-O0-NEXT: v_readlane_b32 s7, v30, 9 -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_readlane_b32 s6, v29, 8 +; GFX9-O0-NEXT: v_readlane_b32 s7, v29, 9 +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 
; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 
4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 63 -; GFX9-O0-NEXT: s_waitcnt vmcnt(16) -; GFX9-O0-NEXT: v_lshrrev_b64 v[28:29], s4, v[2:3] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v29 +; GFX9-O0-NEXT: s_waitcnt vmcnt(10) +; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], s4, v[2:3] +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v7 ; GFX9-O0-NEXT: s_mov_b32 s5, 1 -; GFX9-O0-NEXT: v_lshlrev_b64 v[22:23], s5, v[22:23] -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v23 -; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v28 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v22 -; GFX9-O0-NEXT: v_or_b32_e64 v22, v5, v10 -; GFX9-O0-NEXT: ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v23, v4 -; GFX9-O0-NEXT: 
v_lshlrev_b64 v[28:29], s5, v[2:3] -; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], s4, v[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v29 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-O0-NEXT: s_waitcnt vmcnt(8) +; GFX9-O0-NEXT: v_lshlrev_b64 v[26:27], s5, v[26:27] +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v27 +; GFX9-O0-NEXT: v_or_b32_e64 v14, v14, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v26 +; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 +; GFX9-O0-NEXT: v_lshlrev_b64 v[26:27], s5, v[2:3] +; GFX9-O0-NEXT: v_lshrrev_b64 v[14:15], s4, v[10:11] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v27 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v15 ; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v28 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_or_b32_e64 v4, v3, v4 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v26 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_or_b32_e64 v14, v3, v14 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v2 ; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s5, v[0:1] -; GFX9-O0-NEXT: v_lshlrev_b64 v[28:29], s5, v[6:7] +; GFX9-O0-NEXT: v_lshlrev_b64 v[26:27], s5, v[10:11] ; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v29 -; GFX9-O0-NEXT: s_waitcnt vmcnt(10) +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v27 -; GFX9-O0-NEXT: v_or3_b32 v6, v6, v7, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v28, v25 +; GFX9-O0-NEXT: v_or3_b32 v10, v10, v11, v28 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v28 -; GFX9-O0-NEXT: 
v_mov_b32_e32 v7, v26 -; GFX9-O0-NEXT: v_or3_b32 v0, v0, v1, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v26 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v24 +; GFX9-O0-NEXT: v_or3_b32 v0, v0, v1, v11 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-O0-NEXT: s_waitcnt vmcnt(8) -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v25 -; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v17 +; GFX9-O0-NEXT: v_or_b32_e64 v10, v10, v11 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v24 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v16 ; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v22 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7 ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v14 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v15 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v13, vcc, v13, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_subb_co_u32_e32 v12, vcc, v12, v10, vcc -; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v11, v4, vcc -; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v5, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v22 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v7, vcc, v7, v10 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v16, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v14, vcc +; GFX9-O0-NEXT: 
v_subb_co_u32_e32 v6, vcc, v5, v11, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 -; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v7 -; GFX9-O0-NEXT: v_ashrrev_i64 v[13:14], s4, v[11:12] -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 +; GFX9-O0-NEXT: v_ashrrev_i64 v[5:6], s4, v[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v6 ; GFX9-O0-NEXT: s_mov_b64 s[4:5], 1 ; GFX9-O0-NEXT: s_mov_b32 s8, s5 -; GFX9-O0-NEXT: v_and_b32_e64 v12, v7, s8 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v13 +; GFX9-O0-NEXT: v_and_b32_e64 v4, v15, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v5 ; GFX9-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 -; GFX9-O0-NEXT: v_and_b32_e64 v14, v11, s4 -; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, 0 +; GFX9-O0-NEXT: v_and_b32_e64 v6, v17, s4 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-O0-NEXT: v_mov_b32_e32 v22, v21 -; GFX9-O0-NEXT: v_and_b32_e64 v22, v7, v22 -; GFX9-O0-NEXT: v_and_b32_e64 v20, v11, v20 +; GFX9-O0-NEXT: v_and_b32_e64 v22, v15, v22 +; GFX9-O0-NEXT: v_and_b32_e64 v20, v17, v20 ; GFX9-O0-NEXT: ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v21, v22 ; GFX9-O0-NEXT: v_mov_b32_e32 v22, v19 -; GFX9-O0-NEXT: v_and_b32_e64 v7, v7, v22 -; GFX9-O0-NEXT: v_and_b32_e64 v22, v11, v18 -; GFX9-O0-NEXT: ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v23, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v22 -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v23 -; 
GFX9-O0-NEXT: v_mov_b32_e32 v11, v20 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v21 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v19 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v10, vcc, v10, v18, vcc -; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v11, vcc -; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v5, v7, vcc +; GFX9-O0-NEXT: v_and_b32_e64 v15, v15, v22 +; GFX9-O0-NEXT: v_and_b32_e64 v17, v17, v18 +; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v17 +; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 killed $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v20 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v21 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v10, vcc, v10, v19 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v16, vcc, v16, v18, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v14, vcc, v14, v17, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v11, v15, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v11 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10 +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v16 +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded 
Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v8 ; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 killed $vgpr8_vgpr9 killed $exec ; GFX9-O0-NEXT: s_mov_b64 s[8:9], -1 ; GFX9-O0-NEXT: s_mov_b32 s5, s8 ; GFX9-O0-NEXT: s_mov_b32 s4, s9 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v16 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, s5 -; GFX9-O0-NEXT: v_add_co_u32_e32 v19, vcc, v11, v16 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, s4 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v11, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v11, s5 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v16, vcc, v10, v11, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v10, s4 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v10, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s5 +; GFX9-O0-NEXT: v_add_co_u32_e32 v11, vcc, v11, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v12, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s5 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v12, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v10, vcc, v10, v12, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v9 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v9 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v16 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v19 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v20 -; GFX9-O0-NEXT: v_mov_b32_e32 v21, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v20 -; GFX9-O0-NEXT: v_or_b32_e64 v18, v18, v21 -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v16 
-; GFX9-O0-NEXT: v_mov_b32_e32 v16, v19 -; GFX9-O0-NEXT: v_or_b32_e64 v16, v16, v17 -; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v18 -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[16:17], v[12:13] +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v8 +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v12 +; GFX9-O0-NEXT: v_or_b32_e64 v10, v10, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v11 +; GFX9-O0-NEXT: v_or_b32_e64 v8, v8, v9 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v10 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[8:9], v[4:5] ; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v2 -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v0 -; GFX9-O0-NEXT: buffer_store_dword v16, off, 
s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v15 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v14 -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v12 -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 4 -; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 5 +; GFX9-O0-NEXT: v_writelane_b32 v29, s6, 4 +; GFX9-O0-NEXT: v_writelane_b32 v29, s7, 5 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 8 -; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 9 +; GFX9-O0-NEXT: v_writelane_b32 v29, s6, 8 +; GFX9-O0-NEXT: 
v_writelane_b32 v29, s7, 9 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill @@ -3085,128 +3080,128 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_branch .LBB1_1 ; GFX9-O0-NEXT: .LBB1_7: ; %udiv-preheader ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; GFX9-O0-NEXT: 
buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword 
v19, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(9) -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], v4, v[18:19] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-O0-NEXT: v_lshrrev_b64 v[14:15], v8, v[18:19] +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v15 ; GFX9-O0-NEXT: s_mov_b32 s4, 64 -; GFX9-O0-NEXT: v_sub_u32_e64 v20, s4, v4 -; GFX9-O0-NEXT: v_lshlrev_b64 v[20:21], v20, v[14:15] +; GFX9-O0-NEXT: v_sub_u32_e64 v20, s4, v8 +; GFX9-O0-NEXT: v_lshlrev_b64 v[20:21], v20, v[16:17] ; GFX9-O0-NEXT: v_mov_b32_e32 v22, v21 -; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v22 -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v20 -; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v22, v7 -; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[6:7], v4, s4 -; GFX9-O0-NEXT: v_sub_u32_e64 v5, v4, s4 -; GFX9-O0-NEXT: v_lshrrev_b64 v[20:21], v5, v[14:15] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v21 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v22, s[6:7] +; GFX9-O0-NEXT: v_or_b32_e64 v11, v11, v22 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v20 +; GFX9-O0-NEXT: v_or_b32_e64 v14, v14, v15 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v15 +; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[6:7], v8, s4 +; GFX9-O0-NEXT: v_sub_u32_e64 v11, v8, s4 +; GFX9-O0-NEXT: v_lshrrev_b64 v[20:21], v11, v[16:17] +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v21 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v11, v11, v22, s[6:7] ; GFX9-O0-NEXT: s_mov_b32 s4, 0 -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, s4 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 
s[4:5], v8, s4 ; GFX9-O0-NEXT: v_mov_b32_e32 v22, v19 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v22, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v20 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v18 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v11, v11, v22, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v20 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v18 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[4:5] ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], v4, v[14:15] -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v11 +; GFX9-O0-NEXT: v_lshrrev_b64 v[16:17], v8, v[16:17] +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v17 ; GFX9-O0-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-O0-NEXT: s_mov_b32 s8, s5 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, s8 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, s8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v8, v11, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 killed $vgpr16_vgpr17 killed $exec ; GFX9-O0-NEXT: s_mov_b32 s8, s4 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v16, v11, v16, s[6:7] ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v14 -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12 -; GFX9-O0-NEXT: 
v_mov_b32_e32 v12, v13 +; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v8 +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v10 ; GFX9-O0-NEXT: s_mov_b64 s[8:9], -1 ; GFX9-O0-NEXT: s_mov_b32 s7, s8 ; GFX9-O0-NEXT: s_mov_b32 s6, s9 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v16 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, s7 -; GFX9-O0-NEXT: v_add_co_u32_e32 v16, vcc, v15, v16 -; GFX9-O0-NEXT: v_mov_b32_e32 v15, s6 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v12, vcc, v12, v15, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v15, s7 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v18, vcc, v14, v15, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v14, s6 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v13, vcc, v13, v14, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s7 +; GFX9-O0-NEXT: v_add_co_u32_e32 v8, vcc, v8, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s6 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v12, vcc, v11, v12, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s7 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v10, vcc, v10, v11, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s6 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v11, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v13 +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9 ; 
GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v12 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v12 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, s5 ; GFX9-O0-NEXT: v_mov_b32_e32 v12, s4 -; GFX9-O0-NEXT: v_mov_b32_e32 v15, s5 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, s4 -; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 8 -; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 9 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v13, s5 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s4 +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill 
+; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_writelane_b32 v29, s4, 8 +; GFX9-O0-NEXT: v_writelane_b32 v29, s5, 9 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill 
@@ -3216,12 +3211,12 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_branch .LBB1_6 ; GFX9-O0-NEXT: .LBB1_8: ; %udiv-bb1 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload @@ -3234,118 +3229,118 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 ; GFX9-O0-NEXT: s_mov_b32 s8, s6 ; GFX9-O0-NEXT: s_mov_b32 s9, s7 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-O0-NEXT: v_add_co_u32_e32 v8, vcc, v3, v4 +; GFX9-O0-NEXT: v_add_co_u32_e32 v7, vcc, v3, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc ; GFX9-O0-NEXT: 
v_mov_b32_e32 v4, s8 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v4, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s9 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc +; GFX9-O0-NEXT: v_addc_co_u32_e32 v4, vcc, v2, v4, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v2, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v8 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 
; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b32 s4, 0x7f ; GFX9-O0-NEXT: v_sub_u32_e64 v2, s4, v3 -; GFX9-O0-NEXT: v_lshlrev_b64 v[4:5], v2, v[10:11] -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5 +; GFX9-O0-NEXT: v_lshlrev_b64 v[0:1], v2, v[11:12] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 ; GFX9-O0-NEXT: s_mov_b32 s4, 64 ; GFX9-O0-NEXT: v_sub_u32_e64 v13, s4, v2 -; GFX9-O0-NEXT: v_lshrrev_b64 v[13:14], v13, v[6:7] +; GFX9-O0-NEXT: v_lshrrev_b64 v[13:14], v13, v[9:10] ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v14 -; GFX9-O0-NEXT: v_or_b32_e64 v12, v12, v15 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v13 -; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v5 +; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v15 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v13 +; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 ; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v2, s4 ; GFX9-O0-NEXT: s_mov_b32 s10, 63 ; GFX9-O0-NEXT: v_sub_u32_e64 v3, s10, v3 -; GFX9-O0-NEXT: v_lshlrev_b64 v[12:13], v3, v[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v14, s[4:5] +; GFX9-O0-NEXT: v_lshlrev_b64 v[13:14], v3, v[9:10] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v14 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[4:5] ; GFX9-O0-NEXT: s_mov_b32 s10, 0 ; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[10:11], v2, s10 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v14, s[10:11] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 -; 
GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[10:11] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v12 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[10:11] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v13 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[10:11] ; GFX9-O0-NEXT: ; implicit-def: $sgpr10 ; GFX9-O0-NEXT: ; implicit-def: $sgpr10 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-O0-NEXT: v_lshlrev_b64 v[6:7], v2, v[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], v2, v[9:10] +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s9 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[4:5] -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, s8 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v3, v6, s[4:5] ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2 -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill 
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9 -; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v8 -; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[0:1], s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v7 +; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[4:5], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec ; GFX9-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] ; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] -; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 6 -; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 7 +; GFX9-O0-NEXT: v_writelane_b32 v29, s6, 6 +; GFX9-O0-NEXT: v_writelane_b32 v29, s7, 7 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_cbranch_execz .LBB1_5 @@ -3364,7 +3359,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] @@ -3774,29 +3769,26 @@ 
define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_readlane_b32 s5, v32, 3 ; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-G-O0-NEXT: ; %bb.2: ; %Flow -; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: 
s_waitcnt vmcnt(7) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) +; GFX9-G-O0-NEXT: s_nop 0 ; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_branch .LBB1_5 ; GFX9-G-O0-NEXT: .LBB1_3: ; %Flow2 @@ -3882,14 +3874,14 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_readlane_b32 s4, v32, 4 ; GFX9-G-O0-NEXT: v_readlane_b32 s5, v32, 5 ; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v1, off, 
s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) @@ -4345,20 +4337,20 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s5 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s4 ; GFX9-G-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[4:5], v[6:7] -; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_nop 0 -; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s10 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_nop 0 -; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], exec ; GFX9-G-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] ; GFX9-G-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll index 7ea98a16e3b84..7e49a86798e73 100644 --- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll @@ -7,37 +7,37 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 0, v0 -; SDAG-NEXT: v_mov_b32_e32 v18, 0 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 ; SDAG-NEXT: v_ashrrev_i32_e32 v24, 31, v3 ; SDAG-NEXT: v_ashrrev_i32_e32 v25, 31, v11 ; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f ; SDAG-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc ; SDAG-NEXT: v_mov_b32_e32 v26, v24 ; SDAG-NEXT: v_mov_b32_e32 v27, v25 -; SDAG-NEXT: v_subb_u32_e32 v19, vcc, 0, v2, vcc +; SDAG-NEXT: v_subb_u32_e32 v21, vcc, 0, v2, vcc ; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3] -; SDAG-NEXT: v_cndmask_b32_e64 v21, v1, v17, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v20, v0, v16, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v19, v1, v17, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v18, v0, v16, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v0, vcc, 0, v3, vcc -; SDAG-NEXT: 
v_cndmask_b32_e64 v16, v2, v19, s[4:5] -; SDAG-NEXT: v_ffbh_u32_e32 v1, v20 -; SDAG-NEXT: v_ffbh_u32_e32 v2, v21 +; SDAG-NEXT: v_cndmask_b32_e64 v16, v2, v21, s[4:5] +; SDAG-NEXT: v_ffbh_u32_e32 v1, v18 +; SDAG-NEXT: v_ffbh_u32_e32 v2, v19 ; SDAG-NEXT: v_cndmask_b32_e64 v17, v3, v0, s[4:5] -; SDAG-NEXT: v_or_b32_e32 v0, v20, v16 +; SDAG-NEXT: v_or_b32_e32 v0, v18, v16 ; SDAG-NEXT: v_sub_i32_e32 v3, vcc, 0, v8 -; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], 32, v1 +; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], 32, v1 ; SDAG-NEXT: v_ffbh_u32_e32 v22, v16 -; SDAG-NEXT: v_or_b32_e32 v1, v21, v17 +; SDAG-NEXT: v_or_b32_e32 v1, v19, v17 ; SDAG-NEXT: v_subb_u32_e32 v23, vcc, 0, v9, vcc -; SDAG-NEXT: v_min_u32_e32 v2, v19, v2 -; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], 32, v22 +; SDAG-NEXT: v_min_u32_e32 v2, v21, v2 +; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], 32, v22 ; SDAG-NEXT: v_ffbh_u32_e32 v22, v17 ; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; SDAG-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[10:11] ; SDAG-NEXT: v_cndmask_b32_e64 v28, v9, v23, s[6:7] ; SDAG-NEXT: v_subb_u32_e32 v0, vcc, 0, v10, vcc ; SDAG-NEXT: v_cndmask_b32_e64 v29, v8, v3, s[6:7] -; SDAG-NEXT: v_min_u32_e32 v1, v19, v22 +; SDAG-NEXT: v_min_u32_e32 v1, v21, v22 ; SDAG-NEXT: v_add_i32_e64 v2, s[8:9], 64, v2 ; SDAG-NEXT: v_addc_u32_e64 v3, s[8:9], 0, 0, s[8:9] ; SDAG-NEXT: v_subb_u32_e32 v8, vcc, 0, v11, vcc @@ -46,17 +46,17 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_cndmask_b32_e64 v9, v3, 0, vcc ; SDAG-NEXT: v_cndmask_b32_e32 v10, v2, v1, vcc ; SDAG-NEXT: v_ffbh_u32_e32 v3, v29 -; SDAG-NEXT: v_ffbh_u32_e32 v19, v28 +; SDAG-NEXT: v_ffbh_u32_e32 v21, v28 ; SDAG-NEXT: v_cndmask_b32_e64 v1, v11, v8, s[6:7] ; SDAG-NEXT: v_or_b32_e32 v2, v29, v0 ; SDAG-NEXT: v_add_i32_e32 v8, vcc, 32, v3 ; SDAG-NEXT: v_ffbh_u32_e32 v11, v0 ; SDAG-NEXT: v_or_b32_e32 v3, v28, v1 -; SDAG-NEXT: v_min_u32_e32 v8, v8, v19 +; SDAG-NEXT: v_min_u32_e32 v8, v8, v21 ; SDAG-NEXT: v_add_i32_e32 v11, vcc, 32, v11 
-; SDAG-NEXT: v_ffbh_u32_e32 v19, v1 +; SDAG-NEXT: v_ffbh_u32_e32 v21, v1 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; SDAG-NEXT: v_min_u32_e32 v2, v11, v19 +; SDAG-NEXT: v_min_u32_e32 v2, v11, v21 ; SDAG-NEXT: v_add_i32_e64 v3, s[6:7], 64, v8 ; SDAG-NEXT: v_addc_u32_e64 v8, s[6:7], 0, 0, s[6:7] ; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[0:1] @@ -66,35 +66,35 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 ; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v8, v9, vcc ; SDAG-NEXT: v_xor_b32_e32 v8, 0x7f, v2 -; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v18, vcc +; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v20, vcc ; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[2:3] -; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v18, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v20, vcc ; SDAG-NEXT: v_or_b32_e32 v8, v8, v10 ; SDAG-NEXT: v_or_b32_e32 v9, v3, v11 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] -; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] ; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] -; SDAG-NEXT: v_cndmask_b32_e64 v8, v18, v19, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v8, v20, v21, s[4:5] ; SDAG-NEXT: v_and_b32_e32 v8, 1, v8 ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v8 ; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v18, v17, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v20, v17, 0, s[4:5] ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; SDAG-NEXT: v_cndmask_b32_e64 v22, v16, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v19, v21, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v21, v19, 0, s[4:5] ; SDAG-NEXT: s_and_b64 s[8:9], s[6:7], vcc -; SDAG-NEXT: v_cndmask_b32_e64 v23, v20, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v23, v18, 0, s[4:5] ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[8:9] ; SDAG-NEXT: s_cbranch_execz 
.LBB0_6 ; SDAG-NEXT: ; %bb.1: ; %udiv-bb15 ; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v2 -; SDAG-NEXT: v_sub_i32_e64 v18, s[4:5], 63, v2 +; SDAG-NEXT: v_sub_i32_e64 v20, s[4:5], 63, v2 ; SDAG-NEXT: v_mov_b32_e32 v8, 0 ; SDAG-NEXT: v_mov_b32_e32 v9, 0 ; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v3, vcc -; SDAG-NEXT: v_lshl_b64 v[18:19], v[20:21], v18 +; SDAG-NEXT: v_lshl_b64 v[20:21], v[18:19], v20 ; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v10, vcc ; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v11, vcc ; SDAG-NEXT: v_or_b32_e32 v10, v30, v32 @@ -102,16 +102,16 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_or_b32_e32 v11, v31, v33 ; SDAG-NEXT: v_lshl_b64 v[2:3], v[16:17], v34 ; SDAG-NEXT: v_sub_i32_e32 v35, vcc, 64, v34 -; SDAG-NEXT: v_lshl_b64 v[22:23], v[20:21], v34 +; SDAG-NEXT: v_lshl_b64 v[22:23], v[18:19], v34 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] -; SDAG-NEXT: v_lshr_b64 v[10:11], v[20:21], v35 +; SDAG-NEXT: v_lshr_b64 v[10:11], v[18:19], v35 ; SDAG-NEXT: v_or_b32_e32 v3, v3, v11 ; SDAG-NEXT: v_or_b32_e32 v2, v2, v10 ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v34 -; SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, v23, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, v22, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v3, v21, v3, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v2, v20, v2, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, v23, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, v22, s[4:5] ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v34 ; SDAG-NEXT: v_cndmask_b32_e64 v3, v3, v17, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v16, s[4:5] @@ -121,7 +121,7 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB0_5 ; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4 -; SDAG-NEXT: v_lshr_b64 v[8:9], v[20:21], v30 +; SDAG-NEXT: v_lshr_b64 v[8:9], v[18:19], v30 ; 
SDAG-NEXT: v_sub_i32_e32 v10, vcc, 64, v30 ; SDAG-NEXT: v_lshl_b64 v[10:11], v[16:17], v10 ; SDAG-NEXT: v_or_b32_e32 v11, v9, v11 @@ -131,9 +131,9 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_lshr_b64 v[8:9], v[16:17], v8 ; SDAG-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v30 -; SDAG-NEXT: v_cndmask_b32_e64 v21, v9, v21, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v19, v9, v19, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v20, v8, v20, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v18, v8, v18, s[4:5] ; SDAG-NEXT: v_lshr_b64 v[8:9], v[16:17], v30 ; SDAG-NEXT: v_cndmask_b32_e32 v23, 0, v9, vcc ; SDAG-NEXT: v_cndmask_b32_e32 v22, 0, v8, vcc @@ -149,30 +149,30 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_mov_b32_e32 v9, 0 ; SDAG-NEXT: .LBB0_3: ; %udiv-do-while3 ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 -; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v19 -; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 -; SDAG-NEXT: v_lshl_b64 v[22:23], v[22:23], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v21 +; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v21 ; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 +; SDAG-NEXT: v_lshl_b64 v[22:23], v[22:23], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v19 +; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 ; SDAG-NEXT: v_lshrrev_b32_e32 v39, 31, v3 ; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; SDAG-NEXT: v_or_b32_e32 v19, v17, v19 -; SDAG-NEXT: v_or_b32_e32 v18, v16, v18 +; SDAG-NEXT: v_or_b32_e32 v21, v17, v21 +; SDAG-NEXT: v_or_b32_e32 v20, v16, v20 ; SDAG-NEXT: v_or_b32_e32 v16, v22, v38 -; SDAG-NEXT: v_or_b32_e32 v17, v20, v39 +; SDAG-NEXT: v_or_b32_e32 v17, v18, v39 ; SDAG-NEXT: v_or_b32_e32 v2, v2, v8 ; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v34, v17 -; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v35, v21, vcc +; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v35, v19, vcc ; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v36, v16, vcc ; 
SDAG-NEXT: v_subb_u32_e32 v8, vcc, v37, v23, vcc ; SDAG-NEXT: v_ashrrev_i32_e32 v8, 31, v8 -; SDAG-NEXT: v_and_b32_e32 v20, v8, v29 +; SDAG-NEXT: v_and_b32_e32 v18, v8, v29 ; SDAG-NEXT: v_and_b32_e32 v22, v8, v28 ; SDAG-NEXT: v_and_b32_e32 v38, v8, v0 ; SDAG-NEXT: v_and_b32_e32 v39, v8, v1 ; SDAG-NEXT: v_and_b32_e32 v8, 1, v8 -; SDAG-NEXT: v_sub_i32_e32 v20, vcc, v17, v20 -; SDAG-NEXT: v_subb_u32_e32 v21, vcc, v21, v22, vcc +; SDAG-NEXT: v_sub_i32_e32 v18, vcc, v17, v18 +; SDAG-NEXT: v_subb_u32_e32 v19, vcc, v19, v22, vcc ; SDAG-NEXT: v_subb_u32_e32 v22, vcc, v16, v38, vcc ; SDAG-NEXT: v_subb_u32_e32 v23, vcc, v23, v39, vcc ; SDAG-NEXT: v_add_i32_e32 v30, vcc, -1, v30 @@ -194,11 +194,11 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: .LBB0_5: ; %Flow14 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] ; SDAG-NEXT: v_lshl_b64 v[0:1], v[2:3], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v16, 31, v19 -; SDAG-NEXT: v_lshl_b64 v[2:3], v[18:19], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v16, 31, v21 +; SDAG-NEXT: v_lshl_b64 v[2:3], v[20:21], 1 ; SDAG-NEXT: v_or_b32_e32 v0, v0, v16 -; SDAG-NEXT: v_or_b32_e32 v18, v11, v1 -; SDAG-NEXT: v_or_b32_e32 v19, v9, v3 +; SDAG-NEXT: v_or_b32_e32 v20, v11, v1 +; SDAG-NEXT: v_or_b32_e32 v21, v9, v3 ; SDAG-NEXT: v_or_b32_e32 v22, v10, v0 ; SDAG-NEXT: v_or_b32_e32 v23, v8, v2 ; SDAG-NEXT: .LBB0_6: ; %Flow16 @@ -208,110 +208,110 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_sub_i32_e32 v0, vcc, 0, v4 ; SDAG-NEXT: v_mov_b32_e32 v8, 0 ; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f -; SDAG-NEXT: v_mov_b32_e32 v20, v16 -; SDAG-NEXT: v_mov_b32_e32 v21, v17 +; SDAG-NEXT: v_mov_b32_e32 v18, v16 +; SDAG-NEXT: v_mov_b32_e32 v19, v17 ; SDAG-NEXT: v_subb_u32_e32 v1, vcc, 0, v5, vcc ; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v6, vcc ; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v3, v5, v1, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v2, v4, v0, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 
v0, vcc, 0, v7, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v4, v6, v9, s[4:5] ; SDAG-NEXT: v_ffbh_u32_e32 v1, v2 -; SDAG-NEXT: v_ffbh_u32_e32 v4, v3 -; SDAG-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[4:5] -; SDAG-NEXT: v_sub_i32_e32 v5, vcc, 0, v12 -; SDAG-NEXT: v_or_b32_e32 v0, v2, v6 -; SDAG-NEXT: v_ffbh_u32_e32 v9, v6 +; SDAG-NEXT: v_ffbh_u32_e32 v6, v3 +; SDAG-NEXT: v_cndmask_b32_e64 v5, v7, v0, s[4:5] +; SDAG-NEXT: v_sub_i32_e32 v7, vcc, 0, v12 +; SDAG-NEXT: v_or_b32_e32 v0, v2, v4 +; SDAG-NEXT: v_ffbh_u32_e32 v9, v4 ; SDAG-NEXT: v_add_i32_e64 v10, s[4:5], 32, v1 ; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v13, vcc -; SDAG-NEXT: v_or_b32_e32 v1, v3, v7 +; SDAG-NEXT: v_or_b32_e32 v1, v3, v5 ; SDAG-NEXT: v_add_i32_e64 v9, s[4:5], 32, v9 -; SDAG-NEXT: v_ffbh_u32_e32 v30, v7 -; SDAG-NEXT: v_min_u32_e32 v4, v10, v4 +; SDAG-NEXT: v_ffbh_u32_e32 v30, v5 +; SDAG-NEXT: v_min_u32_e32 v6, v10, v6 ; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v14, vcc ; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[14:15] ; SDAG-NEXT: v_cndmask_b32_e64 v28, v13, v11, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v29, v12, v5, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v29, v12, v7, s[4:5] ; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[0:1] ; SDAG-NEXT: v_min_u32_e32 v1, v9, v30 -; SDAG-NEXT: v_add_i32_e64 v4, s[8:9], 64, v4 -; SDAG-NEXT: v_addc_u32_e64 v5, s[8:9], 0, 0, s[8:9] +; SDAG-NEXT: v_add_i32_e64 v6, s[8:9], 64, v6 +; SDAG-NEXT: v_addc_u32_e64 v7, s[8:9], 0, 0, s[8:9] ; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v15, vcc ; SDAG-NEXT: v_cndmask_b32_e64 v0, v14, v10, s[4:5] ; SDAG-NEXT: v_ffbh_u32_e32 v10, v29 ; SDAG-NEXT: v_ffbh_u32_e32 v11, v28 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v12, v5, 0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v13, v4, v1, vcc +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v12, v7, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v13, v6, v1, vcc ; SDAG-NEXT: v_cndmask_b32_e64 v1, v15, v9, s[4:5] 
-; SDAG-NEXT: v_or_b32_e32 v4, v29, v0 +; SDAG-NEXT: v_or_b32_e32 v6, v29, v0 ; SDAG-NEXT: v_ffbh_u32_e32 v9, v0 ; SDAG-NEXT: v_add_i32_e32 v10, vcc, 32, v10 -; SDAG-NEXT: v_or_b32_e32 v5, v28, v1 +; SDAG-NEXT: v_or_b32_e32 v7, v28, v1 ; SDAG-NEXT: v_add_i32_e32 v9, vcc, 32, v9 ; SDAG-NEXT: v_ffbh_u32_e32 v14, v1 ; SDAG-NEXT: v_min_u32_e32 v10, v10, v11 -; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] -; SDAG-NEXT: v_min_u32_e32 v4, v9, v14 -; SDAG-NEXT: v_add_i32_e64 v5, s[4:5], 64, v10 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; SDAG-NEXT: v_min_u32_e32 v6, v9, v14 +; SDAG-NEXT: v_add_i32_e64 v7, s[4:5], 64, v10 ; SDAG-NEXT: v_addc_u32_e64 v9, s[4:5], 0, 0, s[4:5] ; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, 0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc -; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v13 -; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v9, v12, vcc -; SDAG-NEXT: v_xor_b32_e32 v9, 0x7f, v4 +; SDAG-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc +; SDAG-NEXT: v_sub_i32_e32 v6, vcc, v6, v13 +; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v9, v12, vcc +; SDAG-NEXT: v_xor_b32_e32 v9, 0x7f, v6 ; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v8, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[4:5] +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v8, vcc ; SDAG-NEXT: v_or_b32_e32 v8, v9, v10 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] ; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; SDAG-NEXT: v_or_b32_e32 v9, v5, v11 +; SDAG-NEXT: v_or_b32_e32 v9, v7, v11 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] ; SDAG-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] ; SDAG-NEXT: v_and_b32_e32 v8, 1, v12 ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v8 ; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v13, v7, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 
v13, v5, 0, s[4:5] ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; SDAG-NEXT: v_cndmask_b32_e64 v9, v6, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v9, v4, 0, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v14, v3, 0, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v8, v2, 0, s[4:5] ; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB0_12 ; SDAG-NEXT: ; %bb.7: ; %udiv-bb1 -; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v4 -; SDAG-NEXT: v_sub_i32_e64 v12, s[4:5], 63, v4 +; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v6 +; SDAG-NEXT: v_sub_i32_e64 v12, s[4:5], 63, v6 ; SDAG-NEXT: v_mov_b32_e32 v8, 0 ; SDAG-NEXT: v_mov_b32_e32 v9, 0 -; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v5, vcc +; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v7, vcc ; SDAG-NEXT: v_lshl_b64 v[12:13], v[2:3], v12 ; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v10, vcc ; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v11, vcc ; SDAG-NEXT: v_or_b32_e32 v10, v30, v32 -; SDAG-NEXT: v_sub_i32_e32 v34, vcc, 0x7f, v4 +; SDAG-NEXT: v_sub_i32_e32 v34, vcc, 0x7f, v6 ; SDAG-NEXT: v_or_b32_e32 v11, v31, v33 -; SDAG-NEXT: v_lshl_b64 v[4:5], v[6:7], v34 +; SDAG-NEXT: v_lshl_b64 v[6:7], v[4:5], v34 ; SDAG-NEXT: v_sub_i32_e32 v35, vcc, 64, v34 ; SDAG-NEXT: v_lshl_b64 v[14:15], v[2:3], v34 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] ; SDAG-NEXT: v_lshr_b64 v[10:11], v[2:3], v35 -; SDAG-NEXT: v_or_b32_e32 v5, v5, v11 -; SDAG-NEXT: v_or_b32_e32 v4, v4, v10 +; SDAG-NEXT: v_or_b32_e32 v7, v7, v11 +; SDAG-NEXT: v_or_b32_e32 v6, v6, v10 ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v34 -; SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v6, v12, v6, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v15, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v14, s[4:5] ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v34 -; SDAG-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[4:5] -; SDAG-NEXT: 
v_cndmask_b32_e64 v4, v4, v6, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v7, v7, v5, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v6, v6, v4, s[4:5] ; SDAG-NEXT: v_mov_b32_e32 v12, 0 ; SDAG-NEXT: v_mov_b32_e32 v13, 0 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -321,24 +321,24 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_lshr_b64 v[8:9], v[2:3], v30 ; SDAG-NEXT: v_sub_i32_e32 v35, vcc, 64, v30 ; SDAG-NEXT: v_subrev_i32_e32 v36, vcc, 64, v30 -; SDAG-NEXT: v_lshr_b64 v[37:38], v[6:7], v30 +; SDAG-NEXT: v_lshr_b64 v[37:38], v[4:5], v30 ; SDAG-NEXT: v_add_i32_e32 v34, vcc, -1, v29 ; SDAG-NEXT: v_mov_b32_e32 v12, 0 ; SDAG-NEXT: v_mov_b32_e32 v13, 0 ; SDAG-NEXT: v_mov_b32_e32 v14, 0 ; SDAG-NEXT: v_mov_b32_e32 v15, 0 ; SDAG-NEXT: s_mov_b64 s[10:11], 0 -; SDAG-NEXT: v_lshl_b64 v[48:49], v[6:7], v35 -; SDAG-NEXT: v_lshr_b64 v[6:7], v[6:7], v36 +; SDAG-NEXT: v_lshl_b64 v[48:49], v[4:5], v35 +; SDAG-NEXT: v_lshr_b64 v[4:5], v[4:5], v36 ; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v28, vcc ; SDAG-NEXT: v_or_b32_e32 v9, v9, v49 ; SDAG-NEXT: v_or_b32_e32 v8, v8, v48 ; SDAG-NEXT: v_addc_u32_e32 v36, vcc, -1, v0, vcc ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v30 -; SDAG-NEXT: v_cndmask_b32_e64 v9, v7, v9, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v8, v6, v8, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v38, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v37, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v9, v5, v9, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v8, v4, v8, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v5, 0, v38, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v37, s[4:5] ; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v1, vcc ; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 ; SDAG-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc @@ -346,23 +346,23 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_mov_b32_e32 v9, 0 ; SDAG-NEXT: .LBB0_9: ; %udiv-do-while ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 -; SDAG-NEXT: v_lshl_b64 v[6:7], 
v[6:7], 1 +; SDAG-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 ; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v3 ; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v5 -; SDAG-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v7 +; SDAG-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 ; SDAG-NEXT: v_lshrrev_b32_e32 v39, 31, v11 ; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 -; SDAG-NEXT: v_or_b32_e32 v6, v6, v8 +; SDAG-NEXT: v_or_b32_e32 v4, v4, v8 ; SDAG-NEXT: v_or_b32_e32 v2, v2, v38 -; SDAG-NEXT: v_or_b32_e32 v4, v4, v39 -; SDAG-NEXT: v_or_b32_e32 v5, v13, v5 +; SDAG-NEXT: v_or_b32_e32 v6, v6, v39 +; SDAG-NEXT: v_or_b32_e32 v7, v13, v7 ; SDAG-NEXT: v_or_b32_e32 v11, v15, v11 ; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v34, v2 -; SDAG-NEXT: v_or_b32_e32 v4, v12, v4 +; SDAG-NEXT: v_or_b32_e32 v6, v12, v6 ; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v35, v3, vcc -; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v36, v6, vcc -; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v37, v7, vcc +; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v36, v4, vcc +; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v37, v5, vcc ; SDAG-NEXT: v_ashrrev_i32_e32 v8, 31, v8 ; SDAG-NEXT: v_and_b32_e32 v15, v8, v29 ; SDAG-NEXT: v_and_b32_e32 v38, v8, v28 @@ -370,8 +370,8 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_and_b32_e32 v48, v8, v1 ; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v2, v15 ; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v38, vcc -; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v6, v39, vcc -; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v7, v48, vcc +; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v4, v39, vcc +; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v48, vcc ; SDAG-NEXT: v_add_i32_e32 v30, vcc, -1, v30 ; SDAG-NEXT: v_addc_u32_e32 v31, vcc, -1, v31, vcc ; SDAG-NEXT: v_addc_u32_e32 v32, vcc, -1, v32, vcc @@ -390,7 +390,7 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB0_11: ; %Flow11 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] -; 
SDAG-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 +; SDAG-NEXT: v_lshl_b64 v[0:1], v[6:7], 1 ; SDAG-NEXT: v_lshrrev_b32_e32 v4, 31, v11 ; SDAG-NEXT: v_lshl_b64 v[2:3], v[10:11], 1 ; SDAG-NEXT: v_or_b32_e32 v0, v0, v4 @@ -402,11 +402,11 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: v_xor_b32_e32 v3, v27, v26 ; SDAG-NEXT: v_xor_b32_e32 v2, v25, v24 -; SDAG-NEXT: v_xor_b32_e32 v7, v21, v20 +; SDAG-NEXT: v_xor_b32_e32 v7, v19, v18 ; SDAG-NEXT: v_xor_b32_e32 v6, v17, v16 -; SDAG-NEXT: v_xor_b32_e32 v4, v18, v3 +; SDAG-NEXT: v_xor_b32_e32 v4, v20, v3 ; SDAG-NEXT: v_xor_b32_e32 v5, v22, v2 -; SDAG-NEXT: v_xor_b32_e32 v1, v19, v3 +; SDAG-NEXT: v_xor_b32_e32 v1, v21, v3 ; SDAG-NEXT: v_xor_b32_e32 v0, v23, v2 ; SDAG-NEXT: v_xor_b32_e32 v10, v13, v7 ; SDAG-NEXT: v_xor_b32_e32 v9, v9, v6 diff --git a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll index 8c3d20ffb02fd..948ce6216998a 100644 --- a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll +++ b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll @@ -30,8 +30,8 @@ define amdgpu_ps void @main(i32 %0, float %1) { ; ISA-NEXT: .LBB0_1: ; %Flow1 ; ISA-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; ISA-NEXT: s_or_b64 exec, exec, s[4:5] -; ISA-NEXT: s_mov_b64 s[8:9], 0 ; ISA-NEXT: s_mov_b64 s[4:5], s[6:7] +; ISA-NEXT: s_mov_b64 s[8:9], 0 ; ISA-NEXT: .LBB0_2: ; %Flow ; ISA-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; ISA-NEXT: s_and_b64 s[6:7], exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll index 56ad91dd59ffb..8d4babb50979f 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll @@ -5407,54 +5407,50 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-LABEL: 
flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 -; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB24_3 -; GFX90A-NEXT: ; %bb.1: ; %Flow2 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB24_6 -; GFX90A-NEXT: .LBB24_2: ; %atomicrmw.phi -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; GFX90A-NEXT: .LBB24_3: ; %atomicrmw.global -; GFX90A-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB24_4 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: .LBB24_4: ; %atomicrmw.start +; GFX90A-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX90A-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, 
v[2:3], v[8:9] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB24_4 -; GFX90A-NEXT: ; %bb.5: ; %Flow +; GFX90A-NEXT: s_cbranch_execnz .LBB24_2 +; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: .LBB24_4: ; %Flow2 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execz .LBB24_2 -; GFX90A-NEXT: .LBB24_6: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_cbranch_execz .LBB24_6 +; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] -; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll index 
f0083bd23660a..85a62d8b22a4e 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll @@ -5407,54 +5407,50 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 -; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB24_3 -; GFX90A-NEXT: ; %bb.1: ; %Flow2 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB24_6 -; GFX90A-NEXT: .LBB24_2: ; %atomicrmw.phi -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; GFX90A-NEXT: .LBB24_3: ; %atomicrmw.global -; GFX90A-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB24_4 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: .LBB24_4: ; %atomicrmw.start +; GFX90A-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX90A-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX90A-NEXT: 
v_min_f64 v[6:7], v[2:3], v[4:5] +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB24_4 -; GFX90A-NEXT: ; %bb.5: ; %Flow +; GFX90A-NEXT: s_cbranch_execnz .LBB24_2 +; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: .LBB24_4: ; %Flow2 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execz .LBB24_2 -; GFX90A-NEXT: .LBB24_6: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_cbranch_execz .LBB24_6 +; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] -; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX90A-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: 
v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll index 1f74fbdc46e98..0b8f475738794 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll @@ -711,20 +711,20 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB3_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -738,21 +738,21 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB3_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -892,20 +892,20 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: 
s_cbranch_execnz .LBB4_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -919,21 +919,21 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB4_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1074,7 +1074,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: s_mov_b64 s[8:9], 0 @@ -1083,15 +1083,15 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB5_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1105,7 +1105,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX6-NEXT: s_mov_b64 s[8:9], 0 @@ -1114,16 +1114,16 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX6-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, 
v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB5_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1457,20 +1457,20 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB7_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1484,21 +1484,21 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX6-NEXT: s_waitcnt 
expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB7_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2059,20 +2059,20 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_deno ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB10_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2086,21 +2086,21 @@ define 
void @global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_deno ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB10_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2818,20 +2818,20 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote(ptr addrspace(1) %p ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_add_f32_e32 v4, v5, 
v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB14_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2845,21 +2845,21 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote(ptr addrspace(1) %p ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB14_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3000,20 +3000,20 @@ define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory( ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, 
s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB15_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3027,21 +3027,21 @@ define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory( ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; 
GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB15_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3156,20 +3156,20 @@ define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory_ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3183,21 +3183,21 @@ define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory_ ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start ; 
GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB16_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3352,20 +3352,20 @@ define void @global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; 
GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3379,21 +3379,21 @@ define void @global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB17_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3747,20 +3747,20 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr add ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 
v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB19_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3774,21 +3774,21 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr add ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB19_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4141,20 +4141,20 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu ; GFX7-NEXT: s_mov_b32 s7, 0xf000 
; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB21_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4168,21 +4168,21 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: 
buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB21_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4505,20 +4505,20 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB23_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4532,21 +4532,21 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v3, v4, 
v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB23_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5189,20 +5189,20 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB27_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5216,21 +5216,21 @@ define void 
@global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB27_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5344,20 +5344,20 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: 
v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB28_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5371,21 +5371,21 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB28_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5500,7 +5500,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], 
s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: s_mov_b64 s[8:9], 0 @@ -5509,15 +5509,15 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB29_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5531,7 +5531,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX6-NEXT: s_mov_b64 s[8:9], 0 @@ -5540,16 +5540,16 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: 
buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB29_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5843,20 +5843,20 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB31_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5870,21 +5870,21 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; 
GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB31_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6172,20 +6172,20 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap 
v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB33_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6199,21 +6199,21 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_ ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB33_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6567,20 +6567,20 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory(pt ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], 
s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB35_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6594,21 +6594,21 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory(pt ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: 
v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB35_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6891,20 +6891,20 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB37_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6918,21 +6918,21 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], 
s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB37_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7769,23 +7769,23 @@ define void @global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v11, v7 -; GFX7-NEXT: v_mov_b32_e32 v10, v6 ; GFX7-NEXT: v_mov_b32_e32 v9, v5 ; GFX7-NEXT: v_mov_b32_e32 v8, v4 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v4, v6 +; GFX7-NEXT: v_mov_b32_e32 v5, v7 +; GFX7-NEXT: v_mov_b32_e32 v6, v8 +; GFX7-NEXT: v_mov_b32_e32 v7, v9 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] -; GFX7-NEXT: v_mov_b32_e32 v6, v8 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9] ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v7, v9 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB41_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ 
-7799,24 +7799,24 @@ define void @global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v11, v7 -; GFX6-NEXT: v_mov_b32_e32 v10, v6 ; GFX6-NEXT: v_mov_b32_e32 v9, v5 ; GFX6-NEXT: v_mov_b32_e32 v8, v4 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3] +; GFX6-NEXT: v_mov_b32_e32 v4, v6 +; GFX6-NEXT: v_mov_b32_e32 v5, v7 +; GFX6-NEXT: v_mov_b32_e32 v6, v8 +; GFX6-NEXT: v_mov_b32_e32 v7, v9 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] -; GFX6-NEXT: v_mov_b32_e32 v6, v8 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9] ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v7, v9 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB41_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7975,23 +7975,23 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:2040 +; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:2040 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; 
GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v11, v7 -; GFX7-NEXT: v_mov_b32_e32 v10, v6 ; GFX7-NEXT: v_mov_b32_e32 v9, v5 ; GFX7-NEXT: v_mov_b32_e32 v8, v4 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:2040 glc +; GFX7-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v4, v6 +; GFX7-NEXT: v_mov_b32_e32 v5, v7 +; GFX7-NEXT: v_mov_b32_e32 v6, v8 +; GFX7-NEXT: v_mov_b32_e32 v7, v9 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 offset:2040 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] -; GFX7-NEXT: v_mov_b32_e32 v6, v8 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9] ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v7, v9 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB42_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8005,24 +8005,24 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:2040 +; GFX6-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:2040 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v11, v7 -; GFX6-NEXT: v_mov_b32_e32 v10, v6 ; GFX6-NEXT: v_mov_b32_e32 v9, v5 ; GFX6-NEXT: v_mov_b32_e32 v8, v4 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:2040 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3] +; GFX6-NEXT: v_mov_b32_e32 v4, v6 +; GFX6-NEXT: v_mov_b32_e32 v5, v7 +; GFX6-NEXT: v_mov_b32_e32 v6, v8 +; GFX6-NEXT: v_mov_b32_e32 v7, v9 +; 
GFX6-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 offset:2040 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] -; GFX6-NEXT: v_mov_b32_e32 v6, v8 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9] ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v7, v9 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB42_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8182,7 +8182,7 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: s_mov_b64 s[8:9], 0 @@ -8191,18 +8191,18 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v11, v7 -; GFX7-NEXT: v_mov_b32_e32 v10, v6 ; GFX7-NEXT: v_mov_b32_e32 v9, v5 ; GFX7-NEXT: v_mov_b32_e32 v8, v4 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v4, v6 +; GFX7-NEXT: v_mov_b32_e32 v5, v7 +; GFX7-NEXT: v_mov_b32_e32 v6, v8 +; GFX7-NEXT: v_mov_b32_e32 v7, v9 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] -; GFX7-NEXT: v_mov_b32_e32 v6, v8 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9] ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v7, v9 ; GFX7-NEXT: s_andn2_b64 
exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB43_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8216,7 +8216,7 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX6-NEXT: s_mov_b64 s[8:9], 0 @@ -8225,19 +8225,19 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX6-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v11, v7 -; GFX6-NEXT: v_mov_b32_e32 v10, v6 ; GFX6-NEXT: v_mov_b32_e32 v9, v5 ; GFX6-NEXT: v_mov_b32_e32 v8, v4 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3] +; GFX6-NEXT: v_mov_b32_e32 v4, v6 +; GFX6-NEXT: v_mov_b32_e32 v5, v7 +; GFX6-NEXT: v_mov_b32_e32 v6, v8 +; GFX6-NEXT: v_mov_b32_e32 v7, v9 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] -; GFX6-NEXT: v_mov_b32_e32 v6, v8 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9] ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v7, v9 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB43_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8597,36 +8597,36 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 
addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 -; GFX7-NEXT: v_not_b32_e32 v7, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v2 +; GFX7-NEXT: v_not_b32_e32 v6, v5 ; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v3, v4, v7 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_or_b32_e32 v3, v3, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, v3 -; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v8, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v2, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_and_b32_e32 v5, v8, v6 +; GFX7-NEXT: v_add_f32_e32 v4, v4, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, v2, v4 +; GFX7-NEXT: v_or_b32_e32 v7, v5, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: v_mov_b32_e32 v5, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB44_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v4 ; 
GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -8640,36 +8640,37 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 -; GFX6-NEXT: v_not_b32_e32 v7, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v2 +; GFX6-NEXT: v_not_b32_e32 v6, v5 ; GFX6-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v3, v4, v7 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX6-NEXT: v_or_b32_e32 v3, v3, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, v3 -; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v8, v4 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v2, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_and_b32_e32 v5, v8, v6 +; GFX6-NEXT: v_add_f32_e32 v4, v4, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v2, v4 +; GFX6-NEXT: v_or_b32_e32 v7, v5, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: v_mov_b32_e32 v5, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; 
GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB44_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v2, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -9032,36 +9033,36 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_not_b32_e32 v6, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 -; GFX7-NEXT: v_not_b32_e32 v8, v2 ; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v8, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX7-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: 
v_lshlrev_b32_e32 v3, v2, v3 +; GFX7-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB45_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -9076,37 +9077,37 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_not_b32_e32 v6, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 -; GFX6-NEXT: v_not_b32_e32 v8, v2 ; GFX6-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; 
GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX6-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX6-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB45_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v2, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -9471,36 +9472,36 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_not_b32_e32 v6, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 -; GFX7-NEXT: v_not_b32_e32 v8, v2 ; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, 
v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v8, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX7-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX7-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB46_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -9515,37 +9516,37 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_not_b32_e32 v6, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 
0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 -; GFX6-NEXT: v_not_b32_e32 v8, v2 ; GFX6-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX6-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX6-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB46_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v2, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -9888,28 +9889,28 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v2 +; GFX7-NEXT: v_not_b32_e32 v6, v5 ; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX7-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 -; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX7-NEXT: v_mov_b32_e32 v8, v4 -; GFX7-NEXT: v_mov_b32_e32 v7, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v2, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_and_b32_e32 v5, v8, v6 +; GFX7-NEXT: v_add_f32_e32 v4, v4, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, v2, v4 +; GFX7-NEXT: v_or_b32_e32 v7, v5, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: v_mov_b32_e32 v5, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB47_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9929,29 +9930,29 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 3, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_not_b32_e32 v6, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v2 +; 
GFX6-NEXT: v_not_b32_e32 v6, v5 ; GFX6-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX6-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3 -; GFX6-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX6-NEXT: v_mov_b32_e32 v8, v4 -; GFX6-NEXT: v_mov_b32_e32 v7, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v2, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_and_b32_e32 v5, v8, v6 +; GFX6-NEXT: v_add_f32_e32 v4, v4, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v2, v4 +; GFX6-NEXT: v_or_b32_e32 v7, v5, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: v_mov_b32_e32 v5, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB47_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10304,31 +10305,31 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_not_b32_e32 v6, v4 ; 
GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 -; GFX7-NEXT: v_not_b32_e32 v6, v2 ; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX7-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX7-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB48_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10346,32 +10347,32 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX6-NEXT: 
v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_not_b32_e32 v6, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 -; GFX6-NEXT: v_not_b32_e32 v6, v2 ; GFX6-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX6-NEXT: v_mov_b32_e32 v8, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX6-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX6-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB48_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10726,31 +10727,31 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX7-NEXT: 
v_and_b32_e32 v2, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_not_b32_e32 v6, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 -; GFX7-NEXT: v_not_b32_e32 v6, v2 ; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX7-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX7-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB49_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10768,32 +10769,32 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; 
GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_not_b32_e32 v6, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 -; GFX6-NEXT: v_not_b32_e32 v6, v2 ; GFX6-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX6-NEXT: v_mov_b32_e32 v8, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX6-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX6-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB49_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11066,28 +11067,28 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: 
s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB50_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -11100,29 +11101,29 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; 
GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB50_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 @@ -11379,23 +11380,23 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: 
v_or_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v5, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB51_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11412,24 +11413,24 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX6-NEXT: v_mov_b32_e32 v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v5, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v3 +; 
GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB51_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11800,36 +11801,36 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_not_b32_e32 v6, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 -; GFX7-NEXT: v_not_b32_e32 v8, v2 ; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v8, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX7-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX7-NEXT: 
v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX7-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB52_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -11844,37 +11845,37 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_not_b32_e32 v6, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 -; GFX6-NEXT: v_not_b32_e32 v8, v2 ; GFX6-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 
-; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX6-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX6-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB52_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v2, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -12229,31 +12230,31 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_not_b32_e32 v6, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 -; GFX7-NEXT: v_not_b32_e32 v6, v2 ; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) 
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX7-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX7-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB53_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12271,32 +12272,32 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_not_b32_e32 v6, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 -; GFX6-NEXT: v_not_b32_e32 v6, v2 ; GFX6-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX6-NEXT: v_mov_b32_e32 v8, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX6-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX6-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB53_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12735,35 +12736,35 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v3 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v6, v3 +; GFX7-NEXT: v_not_b32_e32 v6, v5 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start ; 
GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v3, v4, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_or_b32_e32 v3, v3, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, v3 -; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v8, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v3, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v5, v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX7-NEXT: v_or_b32_e32 v7, v5, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: v_mov_b32_e32 v5, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB54_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -12778,35 +12779,36 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v3 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 
v6, v3 +; GFX6-NEXT: v_not_b32_e32 v6, v5 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_and_b32_e32 v3, v4, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX6-NEXT: v_or_b32_e32 v3, v3, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, v3 -; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v8, v4 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v3, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v5, v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX6-NEXT: v_or_b32_e32 v7, v5, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: v_mov_b32_e32 v5, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB54_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -13251,35 +13253,35 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: 
buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v5 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v7, v4 +; GFX7-NEXT: v_not_b32_e32 v6, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v4, v3, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v8, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX7-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB55_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v3 ; GFX7-NEXT: 
v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -13295,36 +13297,36 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v5 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v7, v4 +; GFX6-NEXT: v_not_b32_e32 v6, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v8 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, v3, v7 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX6-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: 
s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB55_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -13771,35 +13773,35 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v5 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v7, v4 +; GFX7-NEXT: v_not_b32_e32 v6, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v4, v3, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v8, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX7-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 
addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB56_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -13815,36 +13817,36 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v5 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v7, v4 +; GFX6-NEXT: v_not_b32_e32 v6, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v8 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, v3, v7 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; 
GFX6-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX6-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB56_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -14263,30 +14265,30 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v3 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v6, v3 +; GFX7-NEXT: v_not_b32_e32 v6, v5 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX7-NEXT: v_mov_b32_e32 v8, v4 -; GFX7-NEXT: v_mov_b32_e32 v7, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; 
GFX7-NEXT: v_lshrrev_b32_e32 v4, v3, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v5, v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX7-NEXT: v_or_b32_e32 v7, v5, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: v_mov_b32_e32 v5, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB57_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14304,31 +14306,31 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v3 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v6, v3 +; GFX6-NEXT: v_not_b32_e32 v6, v5 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX6-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX6-NEXT: v_mov_b32_e32 v8, v4 -; GFX6-NEXT: v_mov_b32_e32 v7, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; 
GFX6-NEXT: v_lshrrev_b32_e32 v4, v3, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v5, v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX6-NEXT: v_or_b32_e32 v7, v5, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: v_mov_b32_e32 v5, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB57_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14763,30 +14765,30 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v5 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v5, v5 +; GFX7-NEXT: v_not_b32_e32 v6, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_lshrrev_b32_e32 
v3, v5, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX7-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB58_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14805,31 +14807,31 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v5 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v5, v5 +; GFX6-NEXT: v_not_b32_e32 v6, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX6-NEXT: v_mov_b32_e32 v8, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt 
expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX6-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB58_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15266,30 +15268,30 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v5 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v5, v5 +; GFX7-NEXT: v_not_b32_e32 v6, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: 
v_lshrrev_b32_e32 v3, v5, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX7-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB59_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15308,31 +15310,31 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v5 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v5, v5 +; GFX6-NEXT: v_not_b32_e32 v6, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX6-NEXT: v_mov_b32_e32 v8, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; 
GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX6-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB59_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15692,28 +15694,28 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 
glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB60_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -15726,29 +15728,29 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: 
v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB60_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 @@ -16092,23 +16094,23 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v5, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: 
s_cbranch_execnz .LBB61_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16125,24 +16127,24 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX6-NEXT: v_mov_b32_e32 v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v5, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB61_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16595,35 +16597,35 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 +; 
GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v5 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v7, v4 +; GFX7-NEXT: v_not_b32_e32 v6, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB62_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v4, v3, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v8, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX7-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB62_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -16639,36 +16641,36 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; 
GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v5 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v7, v4 +; GFX6-NEXT: v_not_b32_e32 v6, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB62_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v8 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, v3, v7 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX6-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB62_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: 
v_lshrrev_b32_e32 v0, v6, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -17105,30 +17107,30 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v5 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v5, v5 +; GFX7-NEXT: v_not_b32_e32 v6, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB63_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX7-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; 
GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB63_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17147,31 +17149,31 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v5 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v5, v5 +; GFX6-NEXT: v_not_b32_e32 v6, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB63_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX6-NEXT: v_mov_b32_e32 v8, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX6-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; 
GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB63_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll index faa74fef2be2f..b100e6899a740 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll @@ -4806,36 +4806,36 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 -; GFX7-NEXT: v_not_b32_e32 v7, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v2 +; GFX7-NEXT: v_not_b32_e32 v6, v5 ; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v3, v4, v7 -; GFX7-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_or_b32_e32 v3, v3, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, v3 -; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v8, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v2, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_and_b32_e32 v5, v8, v6 +; GFX7-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: 
v_lshlrev_b32_e32 v4, v2, v4 +; GFX7-NEXT: v_or_b32_e32 v7, v5, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: v_mov_b32_e32 v5, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB26_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -4849,36 +4849,37 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 -; GFX6-NEXT: v_not_b32_e32 v7, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v2 +; GFX6-NEXT: v_not_b32_e32 v6, v5 ; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v3, v4, v7 -; GFX6-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX6-NEXT: v_or_b32_e32 v3, v3, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, v3 -; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: 
buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v8, v4 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v2, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_and_b32_e32 v5, v8, v6 +; GFX6-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v2, v4 +; GFX6-NEXT: v_or_b32_e32 v7, v5, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: v_mov_b32_e32 v5, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB26_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v2, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -5264,36 +5265,36 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_not_b32_e32 v6, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 -; GFX7-NEXT: v_not_b32_e32 v8, v2 ; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: 
s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 -; GFX7-NEXT: v_max_f32_e32 v2, v2, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v8, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX7-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX7-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB27_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -5308,37 +5309,37 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; 
GFX6-NEXT: v_not_b32_e32 v6, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 -; GFX6-NEXT: v_not_b32_e32 v8, v2 ; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 -; GFX6-NEXT: v_max_f32_e32 v2, v2, v7 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX6-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX6-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB27_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v2, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -5726,36 +5727,36 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, 
v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_not_b32_e32 v6, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 -; GFX7-NEXT: v_not_b32_e32 v8, v2 ; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 -; GFX7-NEXT: v_max_f32_e32 v2, v2, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v8, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX7-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX7-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB28_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 
v0, v2, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -5770,37 +5771,37 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_not_b32_e32 v6, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 -; GFX6-NEXT: v_not_b32_e32 v8, v2 ; GFX6-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 -; GFX6-NEXT: v_max_f32_e32 v2, v2, v7 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX6-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX6-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB28_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v2, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -6165,28 +6166,28 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v2 +; GFX7-NEXT: v_not_b32_e32 v6, v5 ; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX7-NEXT: v_max_f32_e32 v3, v3, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 -; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX7-NEXT: v_mov_b32_e32 v8, v4 -; GFX7-NEXT: v_mov_b32_e32 v7, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v2, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_and_b32_e32 v5, v8, v6 +; GFX7-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, v2, v4 +; GFX7-NEXT: v_or_b32_e32 v7, v5, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: v_mov_b32_e32 v5, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 +; 
GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB29_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6206,29 +6207,29 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 3, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_not_b32_e32 v6, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v2 +; GFX6-NEXT: v_not_b32_e32 v6, v5 ; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX6-NEXT: v_max_f32_e32 v3, v3, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3 -; GFX6-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX6-NEXT: v_mov_b32_e32 v8, v4 -; GFX6-NEXT: v_mov_b32_e32 v7, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v2, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_and_b32_e32 v5, v8, v6 +; GFX6-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v2, v4 +; GFX6-NEXT: v_or_b32_e32 v7, v5, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: v_mov_b32_e32 v5, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v7 ; 
GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB29_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6608,31 +6609,31 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_not_b32_e32 v6, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 -; GFX7-NEXT: v_not_b32_e32 v6, v2 ; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 -; GFX7-NEXT: v_max_f32_e32 v2, v2, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX7-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX7-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX7-NEXT: s_or_b64 s[8:9], 
vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB30_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6650,32 +6651,32 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_not_b32_e32 v6, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 -; GFX6-NEXT: v_not_b32_e32 v6, v2 ; GFX6-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 -; GFX6-NEXT: v_max_f32_e32 v2, v2, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX6-NEXT: v_mov_b32_e32 v8, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX6-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX6-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; 
GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB30_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7057,31 +7058,31 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_not_b32_e32 v6, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 -; GFX7-NEXT: v_not_b32_e32 v6, v2 ; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 -; GFX7-NEXT: v_max_f32_e32 v2, v2, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX7-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX7-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: 
s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB31_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7099,32 +7100,32 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_not_b32_e32 v6, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 -; GFX6-NEXT: v_not_b32_e32 v6, v2 ; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 -; GFX6-NEXT: v_max_f32_e32 v2, v2, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX6-NEXT: v_mov_b32_e32 v8, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX6-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX6-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; 
GFX6-NEXT: v_mov_b32_e32 v4, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB31_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7415,28 +7416,28 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX7-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB32_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] 
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -7449,29 +7450,29 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX6-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB32_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr 
half, ptr addrspace(1) %ptr, i64 1023 @@ -7750,23 +7751,23 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX7-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v5, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB33_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7783,24 +7784,24 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX6-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX6-NEXT: v_mov_b32_e32 v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v5, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB33_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8194,36 +8195,36 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_not_b32_e32 v6, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 -; GFX7-NEXT: v_not_b32_e32 v8, v2 ; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: 
s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 -; GFX7-NEXT: v_max_f32_e32 v2, v2, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v8, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX7-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX7-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB34_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -8238,37 +8239,37 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; 
GFX6-NEXT: v_not_b32_e32 v6, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 -; GFX6-NEXT: v_not_b32_e32 v8, v2 ; GFX6-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 -; GFX6-NEXT: v_max_f32_e32 v2, v2, v7 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX6-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX6-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB34_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v2, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -8650,31 +8651,31 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, 
v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_not_b32_e32 v6, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 -; GFX7-NEXT: v_not_b32_e32 v6, v2 ; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 -; GFX7-NEXT: v_max_f32_e32 v2, v2, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX7-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX7-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB35_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8692,32 +8693,32 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: 
s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_not_b32_e32 v6, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 -; GFX6-NEXT: v_not_b32_e32 v6, v2 ; GFX6-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 -; GFX6-NEXT: v_max_f32_e32 v2, v2, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX6-NEXT: v_mov_b32_e32 v8, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX6-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX6-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB35_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9156,36 +9157,36 @@ define 
bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v3 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v6, v3 +; GFX7-NEXT: v_not_b32_e32 v6, v5 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_max_f32_e32 v2, v2, v7 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v3, v4, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_or_b32_e32 v3, v3, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, v3 -; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v8, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v3, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v5, v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX7-NEXT: v_or_b32_e32 v7, v5, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: v_mov_b32_e32 v5, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: 
s_cbranch_execnz .LBB36_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -9200,36 +9201,37 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v3 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v6, v3 +; GFX6-NEXT: v_not_b32_e32 v6, v5 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_max_f32_e32 v2, v2, v7 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_and_b32_e32 v3, v4, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX6-NEXT: v_or_b32_e32 v3, v3, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, v3 -; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v8, v4 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v3, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v5, v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX6-NEXT: v_or_b32_e32 v7, v5, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: 
v_mov_b32_e32 v5, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB36_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -9674,36 +9676,36 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v5 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v7, v4 +; GFX7-NEXT: v_not_b32_e32 v6, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_max_f32_e32 v2, v2, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v4, v3, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v8, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v8 +; 
GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX7-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB37_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -9719,37 +9721,37 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v5 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v7, v4 +; GFX6-NEXT: v_not_b32_e32 v6, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_max_f32_e32 v2, v2, v8 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: 
s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, v3, v7 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX6-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB37_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -10196,36 +10198,36 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v5 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v7, v4 +; GFX7-NEXT: v_not_b32_e32 v6, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB38_1: 
; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_max_f32_e32 v2, v2, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v4, v3, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v8, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX7-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB38_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -10241,37 +10243,37 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; 
GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v5 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v7, v4 +; GFX6-NEXT: v_not_b32_e32 v6, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_max_f32_e32 v2, v2, v8 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, v3, v7 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX6-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB38_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ 
-10690,31 +10692,31 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v3 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v6, v3 +; GFX7-NEXT: v_not_b32_e32 v6, v5 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_max_f32_e32 v3, v3, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX7-NEXT: v_mov_b32_e32 v8, v4 -; GFX7-NEXT: v_mov_b32_e32 v7, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v3, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v5, v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX7-NEXT: v_or_b32_e32 v7, v5, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: v_mov_b32_e32 v5, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB39_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 
@@ -10732,32 +10734,32 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v3 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v6, v3 +; GFX6-NEXT: v_not_b32_e32 v6, v5 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_max_f32_e32 v3, v3, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX6-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX6-NEXT: v_mov_b32_e32 v8, v4 -; GFX6-NEXT: v_mov_b32_e32 v7, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v3, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v5, v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX6-NEXT: v_or_b32_e32 v7, v5, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: v_mov_b32_e32 v5, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; 
GFX6-NEXT: s_cbranch_execnz .LBB39_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11192,31 +11194,31 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v5 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v5, v5 +; GFX7-NEXT: v_not_b32_e32 v6, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX7-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 
v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB40_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11235,32 +11237,32 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v5 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v5, v5 +; GFX6-NEXT: v_not_b32_e32 v6, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX6-NEXT: v_mov_b32_e32 v8, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX6-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 
v7, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB40_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11697,31 +11699,31 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v5 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v5, v5 +; GFX7-NEXT: v_not_b32_e32 v6, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX7-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: 
buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB41_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11740,32 +11742,32 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v5 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v5, v5 +; GFX6-NEXT: v_not_b32_e32 v6, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX6-NEXT: v_mov_b32_e32 v8, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX6-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, 
v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB41_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12125,29 +12127,29 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz 
.LBB42_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -12160,30 +12162,30 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB42_1 ; GFX6-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 @@ -12527,24 +12529,24 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v5, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB43_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12561,25 +12563,25 @@ define void 
@global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX6-NEXT: v_mov_b32_e32 v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v5, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB43_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13032,36 +13034,36 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 +; GFX7-NEXT: 
v_lshlrev_b32_e32 v5, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v5 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v7, v4 +; GFX7-NEXT: v_not_b32_e32 v6, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_max_f32_e32 v2, v2, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v4, v3, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v8, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX7-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB44_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -13077,37 +13079,37 @@ define bfloat 
@global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v5 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v7, v4 +; GFX6-NEXT: v_not_b32_e32 v6, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_max_f32_e32 v2, v2, v8 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, v3, v7 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX6-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, 
s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB44_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -13544,31 +13546,31 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v5 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v5, v5 +; GFX7-NEXT: v_not_b32_e32 v6, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX7-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], 
v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB45_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13587,32 +13589,32 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v5 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v5, v5 +; GFX6-NEXT: v_not_b32_e32 v6, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX6-NEXT: v_mov_b32_e32 v8, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX6-NEXT: v_or_b32_e32 v7, 
v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB45_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll index a46b0129b79e6..7cdd1734abf53 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll @@ -4806,36 +4806,36 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 -; GFX7-NEXT: v_not_b32_e32 v7, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v2 +; GFX7-NEXT: v_not_b32_e32 v6, v5 ; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v3, v4, v7 -; GFX7-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_or_b32_e32 v3, v3, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, v3 -; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: 
buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v8, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v2, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_and_b32_e32 v5, v8, v6 +; GFX7-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, v2, v4 +; GFX7-NEXT: v_or_b32_e32 v7, v5, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: v_mov_b32_e32 v5, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB26_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -4849,36 +4849,37 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 -; GFX6-NEXT: v_not_b32_e32 v7, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v2 +; GFX6-NEXT: v_not_b32_e32 v6, v5 ; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4 -; GFX6-NEXT: 
v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v3, v4, v7 -; GFX6-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX6-NEXT: v_or_b32_e32 v3, v3, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, v3 -; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v8, v4 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v2, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_and_b32_e32 v5, v8, v6 +; GFX6-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v2, v4 +; GFX6-NEXT: v_or_b32_e32 v7, v5, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: v_mov_b32_e32 v5, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB26_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v2, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -5264,36 +5265,36 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: 
v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_not_b32_e32 v6, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 -; GFX7-NEXT: v_not_b32_e32 v8, v2 ; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 -; GFX7-NEXT: v_min_f32_e32 v2, v2, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v8, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX7-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX7-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB27_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -5308,37 +5309,37 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: 
v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_not_b32_e32 v6, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 -; GFX6-NEXT: v_not_b32_e32 v8, v2 ; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 -; GFX6-NEXT: v_min_f32_e32 v2, v2, v7 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX6-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX6-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB27_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v2, 
v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -5726,36 +5727,36 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_not_b32_e32 v6, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 -; GFX7-NEXT: v_not_b32_e32 v8, v2 ; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 -; GFX7-NEXT: v_min_f32_e32 v2, v2, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v8, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX7-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX7-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; 
GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB28_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -5770,37 +5771,37 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_not_b32_e32 v6, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 -; GFX6-NEXT: v_not_b32_e32 v8, v2 ; GFX6-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 -; GFX6-NEXT: v_min_f32_e32 v2, v2, v7 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX6-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX6-NEXT: 
v_or_b32_e32 v7, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB28_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v2, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -6165,28 +6166,28 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v2 +; GFX7-NEXT: v_not_b32_e32 v6, v5 ; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX7-NEXT: v_min_f32_e32 v3, v3, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 -; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX7-NEXT: v_mov_b32_e32 v8, v4 -; GFX7-NEXT: v_mov_b32_e32 v7, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v2, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_and_b32_e32 v5, v8, v6 +; GFX7-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: 
v_lshlrev_b32_e32 v4, v2, v4 +; GFX7-NEXT: v_or_b32_e32 v7, v5, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: v_mov_b32_e32 v5, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB29_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6206,29 +6207,29 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 3, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_not_b32_e32 v6, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v2 +; GFX6-NEXT: v_not_b32_e32 v6, v5 ; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX6-NEXT: v_min_f32_e32 v3, v3, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3 -; GFX6-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX6-NEXT: v_mov_b32_e32 v8, v4 -; GFX6-NEXT: v_mov_b32_e32 v7, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v2, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_and_b32_e32 v5, v8, v6 +; GFX6-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v2, v4 +; GFX6-NEXT: v_or_b32_e32 v7, v5, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: 
v_mov_b32_e32 v5, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB29_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6608,31 +6609,31 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_not_b32_e32 v6, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 -; GFX7-NEXT: v_not_b32_e32 v6, v2 ; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 -; GFX7-NEXT: v_min_f32_e32 v2, v2, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX7-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX7-NEXT: v_or_b32_e32 v7, v4, v3 +; 
GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB30_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6650,32 +6651,32 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_not_b32_e32 v6, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 -; GFX6-NEXT: v_not_b32_e32 v6, v2 ; GFX6-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 -; GFX6-NEXT: v_min_f32_e32 v2, v2, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX6-NEXT: v_mov_b32_e32 v8, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX6-NEXT: v_min_f32_e32 v3, v3, v5 +; 
GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX6-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB30_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7057,31 +7058,31 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_not_b32_e32 v6, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 -; GFX7-NEXT: v_not_b32_e32 v6, v2 ; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 -; GFX7-NEXT: v_min_f32_e32 v2, v2, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v8, 
v6 +; GFX7-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX7-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB31_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7099,32 +7100,32 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_not_b32_e32 v6, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 -; GFX6-NEXT: v_not_b32_e32 v6, v2 ; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 -; GFX6-NEXT: v_min_f32_e32 v2, v2, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX6-NEXT: v_mov_b32_e32 v8, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; 
GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX6-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX6-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB31_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7415,28 +7416,28 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX7-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; 
GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB32_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -7449,29 +7450,29 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX6-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; 
GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB32_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 @@ -7750,23 +7751,23 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX7-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v5, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB33_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7783,24 +7784,24 @@ define void 
@global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX6-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX6-NEXT: v_mov_b32_e32 v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v5, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB33_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8194,36 +8195,36 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, 
v2 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_not_b32_e32 v6, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 -; GFX7-NEXT: v_not_b32_e32 v8, v2 ; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 -; GFX7-NEXT: v_min_f32_e32 v2, v2, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v8, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX7-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX7-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB34_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -8238,37 +8239,37 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: 
buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_not_b32_e32 v6, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 -; GFX6-NEXT: v_not_b32_e32 v8, v2 ; GFX6-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 -; GFX6-NEXT: v_min_f32_e32 v2, v2, v7 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX6-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX6-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB34_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: 
v_lshrrev_b32_e32 v0, v6, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v2, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -8650,31 +8651,31 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_not_b32_e32 v6, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 -; GFX7-NEXT: v_not_b32_e32 v6, v2 ; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 -; GFX7-NEXT: v_min_f32_e32 v2, v2, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX7-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX7-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX7-NEXT: 
v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB35_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8692,32 +8693,32 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_not_b32_e32 v6, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 -; GFX6-NEXT: v_not_b32_e32 v6, v2 ; GFX6-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 -; GFX6-NEXT: v_min_f32_e32 v2, v2, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX6-NEXT: v_mov_b32_e32 v8, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX6-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX6-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; 
GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB35_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9156,36 +9157,36 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v3 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v6, v3 +; GFX7-NEXT: v_not_b32_e32 v6, v5 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_min_f32_e32 v2, v2, v7 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v3, v4, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_or_b32_e32 v3, v3, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, v3 -; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v8, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v3, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v5, v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX7-NEXT: v_or_b32_e32 v7, v5, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: 
v_mov_b32_e32 v5, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB36_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -9200,36 +9201,37 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v3 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v6, v3 +; GFX6-NEXT: v_not_b32_e32 v6, v5 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_min_f32_e32 v2, v2, v7 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_and_b32_e32 v3, v4, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX6-NEXT: v_or_b32_e32 v3, v3, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, v3 -; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v8, v4 +; GFX6-NEXT: s_waitcnt expcnt(0) +; 
GFX6-NEXT: v_lshrrev_b32_e32 v4, v3, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v5, v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX6-NEXT: v_or_b32_e32 v7, v5, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: v_mov_b32_e32 v5, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB36_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -9674,36 +9676,36 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v5 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v7, v4 +; GFX7-NEXT: v_not_b32_e32 v6, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_min_f32_e32 v2, v2, v8 -; GFX7-NEXT: 
v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v4, v3, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v8, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX7-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB37_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -9719,37 +9721,37 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v5 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v7, v4 +; GFX6-NEXT: v_not_b32_e32 v6, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; 
GFX6-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_min_f32_e32 v2, v2, v8 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, v3, v7 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX6-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB37_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -10196,36 +10198,36 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; 
GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v5 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v7, v4 +; GFX7-NEXT: v_not_b32_e32 v6, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_min_f32_e32 v2, v2, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v4, v3, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v8, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX7-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB38_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: 
s_setpc_b64 s[30:31] ; @@ -10241,37 +10243,37 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v5 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v7, v4 +; GFX6-NEXT: v_not_b32_e32 v6, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_min_f32_e32 v2, v2, v8 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, v3, v7 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX6-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: 
v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB38_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -10690,31 +10692,31 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v3 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v6, v3 +; GFX7-NEXT: v_not_b32_e32 v6, v5 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_min_f32_e32 v3, v3, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX7-NEXT: v_mov_b32_e32 v8, v4 -; GFX7-NEXT: v_mov_b32_e32 v7, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v3, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v5, v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX7-NEXT: v_or_b32_e32 v7, v5, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: v_mov_b32_e32 v5, v8 +; GFX7-NEXT: buffer_atomic_cmpswap 
v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB39_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10732,32 +10734,32 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v3 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v6, v3 +; GFX6-NEXT: v_not_b32_e32 v6, v5 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_min_f32_e32 v3, v3, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX6-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX6-NEXT: v_mov_b32_e32 v8, v4 -; GFX6-NEXT: v_mov_b32_e32 v7, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v3, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v5, v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX6-NEXT: v_or_b32_e32 v7, v5, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 
+; GFX6-NEXT: v_mov_b32_e32 v5, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB39_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11192,31 +11194,31 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v5 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v5, v5 +; GFX7-NEXT: v_not_b32_e32 v6, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX7-NEXT: 
v_or_b32_e32 v7, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB40_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11235,32 +11237,32 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v5 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v5, v5 +; GFX6-NEXT: v_not_b32_e32 v6, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX6-NEXT: v_mov_b32_e32 v8, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX6-NEXT: 
v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX6-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB40_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11697,31 +11699,31 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v5 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v5, v5 +; GFX7-NEXT: v_not_b32_e32 v6, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 
+; GFX7-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX7-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB41_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11740,32 +11742,32 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v5 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v5, v5 +; GFX6-NEXT: v_not_b32_e32 v6, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX6-NEXT: v_mov_b32_e32 v8, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; 
GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX6-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB41_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12125,29 +12127,29 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 
v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB42_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -12160,30 +12162,30 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap 
v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB42_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 @@ -12527,24 +12529,24 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v5, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; 
GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB43_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12561,25 +12563,25 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX6-NEXT: v_mov_b32_e32 v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v5, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB43_1 ; GFX6-NEXT: ; %bb.2: ; 
%atomicrmw.end @@ -13032,36 +13034,36 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v5 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v7, v4 +; GFX7-NEXT: v_not_b32_e32 v6, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_min_f32_e32 v2, v2, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v4, v3, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v8, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX7-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: 
s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB44_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -13077,37 +13079,37 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v5 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v7, v4 +; GFX6-NEXT: v_not_b32_e32 v6, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_min_f32_e32 v2, v2, v8 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, v3, v7 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX6-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: 
v_mov_b32_e32 v4, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB44_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -13544,31 +13546,31 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v5 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v5, v5 +; GFX7-NEXT: v_not_b32_e32 v6, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; 
GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX7-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB45_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13587,32 +13589,32 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v5 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v5, v5 +; GFX6-NEXT: v_not_b32_e32 v6, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX6-NEXT: v_mov_b32_e32 v8, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; 
GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX6-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB45_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll index 053efdcb76261..8c7b481099a66 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll @@ -894,20 +894,20 @@ define void @global_agent_atomic_fsub_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_sub_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_sub_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: 
buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB3_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -921,21 +921,21 @@ define void @global_agent_atomic_fsub_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_sub_f32_e32 v3, v4, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_sub_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB3_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1117,20 +1117,20 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_sub_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_sub_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB4_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1144,21 +1144,21 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_sub_f32_e32 v3, v4, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_sub_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] 
; GFX6-NEXT: s_cbranch_execnz .LBB4_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1341,7 +1341,7 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: s_mov_b64 s[8:9], 0 @@ -1350,15 +1350,15 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_sub_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_sub_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB5_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1372,7 +1372,7 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX6-NEXT: s_mov_b64 s[8:9], 0 @@ -1381,16 +1381,16 @@ define void 
@global_agent_atomic_fsub_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_sub_f32_e32 v3, v4, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_sub_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB5_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1813,20 +1813,20 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_sub_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_sub_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; 
GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB7_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1840,21 +1840,21 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_sub_f32_e32 v3, v4, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_sub_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB7_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2749,20 +2749,20 @@ define void @global_agent_atomic_fsub_noret_f32__ftz(ptr addrspace(1) %ptr, floa ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_sub_f32_e32 
v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_sub_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB11_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2776,21 +2776,21 @@ define void @global_agent_atomic_fsub_noret_f32__ftz(ptr addrspace(1) %ptr, floa ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_sub_f32_e32 v3, v4, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_sub_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB11_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2972,20 +2972,20 @@ define void 
@global_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspac ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_sub_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_sub_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB12_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2999,21 +2999,21 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspac ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_sub_f32_e32 v3, v4, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_sub_f32_e32 v4, v5, v2 
+; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB12_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3196,7 +3196,7 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: s_mov_b64 s[8:9], 0 @@ -3205,15 +3205,15 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_sub_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_sub_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3227,7 +3227,7 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr addrspac ; 
GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX6-NEXT: s_mov_b64 s[8:9], 0 @@ -3236,16 +3236,16 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_sub_f32_e32 v3, v4, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_sub_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB13_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3668,20 +3668,20 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_sub_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap 
v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_sub_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB15_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3695,21 +3695,21 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_sub_f32_e32 v3, v4, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_sub_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB15_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4663,23 +4663,23 @@ define void @global_agent_atomic_fsub_noret_f64(ptr addrspace(1) %ptr, double %v ; 
GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v11, v7 -; GFX7-NEXT: v_mov_b32_e32 v10, v6 ; GFX7-NEXT: v_mov_b32_e32 v9, v5 ; GFX7-NEXT: v_mov_b32_e32 v8, v4 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v4, v6 +; GFX7-NEXT: v_mov_b32_e32 v5, v7 +; GFX7-NEXT: v_mov_b32_e32 v6, v8 +; GFX7-NEXT: v_mov_b32_e32 v7, v9 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] -; GFX7-NEXT: v_mov_b32_e32 v6, v8 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9] ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v7, v9 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB19_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4693,24 +4693,24 @@ define void @global_agent_atomic_fsub_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v11, v7 -; GFX6-NEXT: v_mov_b32_e32 v10, v6 ; GFX6-NEXT: v_mov_b32_e32 v9, v5 ; GFX6-NEXT: 
v_mov_b32_e32 v8, v4 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3] +; GFX6-NEXT: v_mov_b32_e32 v4, v6 +; GFX6-NEXT: v_mov_b32_e32 v5, v7 +; GFX6-NEXT: v_mov_b32_e32 v6, v8 +; GFX6-NEXT: v_mov_b32_e32 v7, v9 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] -; GFX6-NEXT: v_mov_b32_e32 v6, v8 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9] ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v7, v9 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB19_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4895,23 +4895,23 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:2040 +; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:2040 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v11, v7 -; GFX7-NEXT: v_mov_b32_e32 v10, v6 ; GFX7-NEXT: v_mov_b32_e32 v9, v5 ; GFX7-NEXT: v_mov_b32_e32 v8, v4 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:2040 glc +; GFX7-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v4, v6 +; GFX7-NEXT: v_mov_b32_e32 v5, v7 +; GFX7-NEXT: v_mov_b32_e32 v6, v8 +; GFX7-NEXT: v_mov_b32_e32 v7, v9 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 offset:2040 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] -; GFX7-NEXT: v_mov_b32_e32 v6, 
v8 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9] ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v7, v9 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB20_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4925,24 +4925,24 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:2040 +; GFX6-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:2040 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v11, v7 -; GFX6-NEXT: v_mov_b32_e32 v10, v6 ; GFX6-NEXT: v_mov_b32_e32 v9, v5 ; GFX6-NEXT: v_mov_b32_e32 v8, v4 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:2040 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3] +; GFX6-NEXT: v_mov_b32_e32 v4, v6 +; GFX6-NEXT: v_mov_b32_e32 v5, v7 +; GFX6-NEXT: v_mov_b32_e32 v6, v8 +; GFX6-NEXT: v_mov_b32_e32 v7, v9 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 offset:2040 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] -; GFX6-NEXT: v_mov_b32_e32 v6, v8 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9] ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v7, v9 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB20_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5128,7 +5128,7 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: 
buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: s_mov_b64 s[8:9], 0 @@ -5137,18 +5137,18 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v11, v7 -; GFX7-NEXT: v_mov_b32_e32 v10, v6 ; GFX7-NEXT: v_mov_b32_e32 v9, v5 ; GFX7-NEXT: v_mov_b32_e32 v8, v4 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v4, v6 +; GFX7-NEXT: v_mov_b32_e32 v5, v7 +; GFX7-NEXT: v_mov_b32_e32 v6, v8 +; GFX7-NEXT: v_mov_b32_e32 v7, v9 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] -; GFX7-NEXT: v_mov_b32_e32 v6, v8 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9] ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v7, v9 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB21_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5162,7 +5162,7 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX6-NEXT: s_mov_b64 s[8:9], 0 @@ -5171,19 +5171,19 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: .LBB21_1: ; 
%atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v11, v7 -; GFX6-NEXT: v_mov_b32_e32 v10, v6 ; GFX6-NEXT: v_mov_b32_e32 v9, v5 ; GFX6-NEXT: v_mov_b32_e32 v8, v4 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3] +; GFX6-NEXT: v_mov_b32_e32 v4, v6 +; GFX6-NEXT: v_mov_b32_e32 v5, v7 +; GFX6-NEXT: v_mov_b32_e32 v6, v8 +; GFX6-NEXT: v_mov_b32_e32 v7, v9 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] -; GFX6-NEXT: v_mov_b32_e32 v6, v8 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9] ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v7, v9 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB21_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5543,36 +5543,36 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 -; GFX7-NEXT: v_not_b32_e32 v7, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v2 +; GFX7-NEXT: v_not_b32_e32 v6, v5 ; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4 -; 
GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v3, v4, v7 -; GFX7-NEXT: v_sub_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_or_b32_e32 v3, v3, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, v3 -; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v8, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v2, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_and_b32_e32 v5, v8, v6 +; GFX7-NEXT: v_sub_f32_e32 v4, v4, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, v2, v4 +; GFX7-NEXT: v_or_b32_e32 v7, v5, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: v_mov_b32_e32 v5, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB22_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -5586,36 +5586,37 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 -; GFX6-NEXT: v_not_b32_e32 v7, v2 +; 
GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v2 +; GFX6-NEXT: v_not_b32_e32 v6, v5 ; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v3, v4, v7 -; GFX6-NEXT: v_sub_f32_e32 v2, v2, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX6-NEXT: v_or_b32_e32 v3, v3, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, v3 -; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v8, v4 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v2, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_and_b32_e32 v5, v8, v6 +; GFX6-NEXT: v_sub_f32_e32 v4, v4, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v2, v4 +; GFX6-NEXT: v_or_b32_e32 v7, v5, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: v_mov_b32_e32 v5, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB22_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v2, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -5978,36 +5979,36 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_not_b32_e32 v6, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 -; GFX7-NEXT: v_not_b32_e32 v8, v2 ; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 -; GFX7-NEXT: v_sub_f32_e32 v2, v2, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v8, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX7-NEXT: v_sub_f32_e32 v3, v3, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX7-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB23_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v3 ; GFX7-NEXT: 
v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -6022,37 +6023,37 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_not_b32_e32 v6, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 -; GFX6-NEXT: v_not_b32_e32 v8, v2 ; GFX6-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 -; GFX6-NEXT: v_sub_f32_e32 v2, v2, v7 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX6-NEXT: v_sub_f32_e32 v3, v3, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX6-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX6-NEXT: s_or_b64 
s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB23_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v2, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -6417,36 +6418,36 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_not_b32_e32 v6, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 -; GFX7-NEXT: v_not_b32_e32 v8, v2 ; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 -; GFX7-NEXT: v_sub_f32_e32 v2, v2, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v8, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX7-NEXT: v_sub_f32_e32 v3, v3, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX7-NEXT: v_or_b32_e32 v7, v4, 
v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB24_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -6461,37 +6462,37 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_not_b32_e32 v6, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 -; GFX6-NEXT: v_not_b32_e32 v8, v2 ; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 -; GFX6-NEXT: v_sub_f32_e32 v2, v2, v7 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: 
buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX6-NEXT: v_sub_f32_e32 v3, v3, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX6-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB24_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v2, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -6834,28 +6835,28 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v2 +; GFX7-NEXT: v_not_b32_e32 v6, v5 ; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX7-NEXT: v_sub_f32_e32 v3, v3, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 -; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX7-NEXT: v_mov_b32_e32 v8, v4 -; GFX7-NEXT: 
v_mov_b32_e32 v7, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v2, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_and_b32_e32 v5, v8, v6 +; GFX7-NEXT: v_sub_f32_e32 v4, v4, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, v2, v4 +; GFX7-NEXT: v_or_b32_e32 v7, v5, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: v_mov_b32_e32 v5, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB25_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6875,29 +6876,29 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 3, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_not_b32_e32 v6, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v2 +; GFX6-NEXT: v_not_b32_e32 v6, v5 ; GFX6-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX6-NEXT: v_sub_f32_e32 v3, v3, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3 -; GFX6-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX6-NEXT: v_mov_b32_e32 v8, v4 -; GFX6-NEXT: v_mov_b32_e32 v7, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 
v4, v2, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_and_b32_e32 v5, v8, v6 +; GFX6-NEXT: v_sub_f32_e32 v4, v4, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v2, v4 +; GFX6-NEXT: v_or_b32_e32 v7, v5, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: v_mov_b32_e32 v5, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB25_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7250,31 +7251,31 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_not_b32_e32 v6, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 -; GFX7-NEXT: v_not_b32_e32 v6, v2 ; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 -; GFX7-NEXT: v_sub_f32_e32 v2, v2, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 
0 addr64 glc +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX7-NEXT: v_sub_f32_e32 v3, v3, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX7-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB26_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7292,32 +7293,32 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_not_b32_e32 v6, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 -; GFX6-NEXT: v_not_b32_e32 v6, v2 ; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 -; GFX6-NEXT: v_sub_f32_e32 v2, v2, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX6-NEXT: v_mov_b32_e32 v8, v3 -; GFX6-NEXT: 
v_mov_b32_e32 v7, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX6-NEXT: v_sub_f32_e32 v3, v3, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX6-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB26_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7672,31 +7673,31 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_not_b32_e32 v6, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 -; GFX7-NEXT: v_not_b32_e32 v6, v2 ; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 -; GFX7-NEXT: v_sub_f32_e32 v2, v2, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; 
GFX7-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX7-NEXT: v_sub_f32_e32 v3, v3, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX7-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB27_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7714,32 +7715,32 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_not_b32_e32 v6, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 -; GFX6-NEXT: v_not_b32_e32 v6, v2 ; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 -; GFX6-NEXT: v_sub_f32_e32 v2, v2, v5 -; 
GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX6-NEXT: v_mov_b32_e32 v8, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX6-NEXT: v_sub_f32_e32 v3, v3, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX6-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB27_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8012,28 +8013,28 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX7-NEXT: v_sub_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_sub_f32_e32 v3, v3, v2 +; 
GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB28_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos__align4: @@ -8046,29 +8047,29 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX6-NEXT: v_sub_f32_e32 v2, v2, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], 
s[4:7], 0 addr64 offset:2046 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB28_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 @@ -8325,23 +8326,23 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX7-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v5, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], 
vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB29_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8358,24 +8359,24 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX6-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX6-NEXT: v_mov_b32_e32 v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v5, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB29_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8746,36 +8747,36 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: 
v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_not_b32_e32 v6, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 -; GFX7-NEXT: v_not_b32_e32 v8, v2 ; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 -; GFX7-NEXT: v_sub_f32_e32 v2, v2, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v8, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX7-NEXT: v_sub_f32_e32 v3, v3, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX7-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB30_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 
v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -8790,37 +8791,37 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_not_b32_e32 v6, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 -; GFX6-NEXT: v_not_b32_e32 v8, v2 ; GFX6-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 -; GFX6-NEXT: v_sub_f32_e32 v2, v2, v7 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX6-NEXT: v_sub_f32_e32 v3, v3, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX6-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] 
-; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB30_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v2, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -9175,31 +9176,31 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_not_b32_e32 v6, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 -; GFX7-NEXT: v_not_b32_e32 v6, v2 ; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 -; GFX7-NEXT: v_sub_f32_e32 v2, v2, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX7-NEXT: v_sub_f32_e32 v3, v3, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX7-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: 
v_mov_b32_e32 v4, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB31_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9217,32 +9218,32 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_not_b32_e32 v6, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 -; GFX6-NEXT: v_not_b32_e32 v6, v2 ; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 -; GFX6-NEXT: v_sub_f32_e32 v2, v2, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX6-NEXT: v_mov_b32_e32 v8, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX6-NEXT: v_sub_f32_e32 v3, v3, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: 
v_lshlrev_b32_e32 v3, v2, v3 +; GFX6-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB31_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9681,35 +9682,35 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v3 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v6, v3 +; GFX7-NEXT: v_not_b32_e32 v6, v5 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_sub_f32_e32 v2, v2, v7 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v3, v4, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_or_b32_e32 v3, v3, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, v3 -; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v8, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v3, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v5, v8, v6 +; 
GFX7-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX7-NEXT: v_or_b32_e32 v7, v5, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: v_mov_b32_e32 v5, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB32_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -9724,35 +9725,36 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v3 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v6, v3 +; GFX6-NEXT: v_not_b32_e32 v6, v5 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_sub_f32_e32 v2, v2, v7 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_and_b32_e32 v3, v4, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX6-NEXT: v_or_b32_e32 v3, v3, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, v3 -; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt 
vmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v8, v4 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v3, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v5, v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX6-NEXT: v_or_b32_e32 v7, v5, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: v_mov_b32_e32 v5, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB32_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -10197,35 +10199,35 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v5 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v7, v4 +; GFX7-NEXT: v_not_b32_e32 v6, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_sub_f32_e32 v2, v2, v8 -; GFX7-NEXT: 
v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v4, v3, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v8, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX7-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB33_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -10241,36 +10243,36 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v5 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v7, v4 +; GFX6-NEXT: v_not_b32_e32 v6, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB33_1: ; %atomicrmw.start ; 
GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_sub_f32_e32 v2, v2, v8 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, v3, v7 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX6-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB33_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -10717,35 +10719,35 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v5 ; 
GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v7, v4 +; GFX7-NEXT: v_not_b32_e32 v6, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_sub_f32_e32 v2, v2, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v4, v3, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v8, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX7-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB34_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -10761,36 +10763,36 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 
addr64 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v5 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v7, v4 +; GFX6-NEXT: v_not_b32_e32 v6, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_sub_f32_e32 v2, v2, v8 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, v3, v7 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX6-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB34_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v3 ; GFX6-NEXT: 
v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -11209,30 +11211,30 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v3 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v6, v3 +; GFX7-NEXT: v_not_b32_e32 v6, v5 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_sub_f32_e32 v3, v3, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX7-NEXT: v_mov_b32_e32 v8, v4 -; GFX7-NEXT: v_mov_b32_e32 v7, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v3, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v5, v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX7-NEXT: v_or_b32_e32 v7, v5, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: v_mov_b32_e32 v5, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB35_1 ; GFX7-NEXT: ; 
%bb.2: ; %atomicrmw.end @@ -11250,31 +11252,31 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v3 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v6, v3 +; GFX6-NEXT: v_not_b32_e32 v6, v5 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_sub_f32_e32 v3, v3, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX6-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX6-NEXT: v_mov_b32_e32 v8, v4 -; GFX6-NEXT: v_mov_b32_e32 v7, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v3, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v5, v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX6-NEXT: v_or_b32_e32 v7, v5, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: v_mov_b32_e32 v5, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB35_1 ; GFX6-NEXT: ; %bb.2: ; 
%atomicrmw.end @@ -11709,30 +11711,30 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v5 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v5, v5 +; GFX7-NEXT: v_not_b32_e32 v6, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_sub_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX7-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB36_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11751,31 
+11753,31 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v5 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v5, v5 +; GFX6-NEXT: v_not_b32_e32 v6, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_sub_f32_e32 v2, v2, v6 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX6-NEXT: v_mov_b32_e32 v8, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX6-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB36_1 ; GFX6-NEXT: 
; %bb.2: ; %atomicrmw.end @@ -12212,30 +12214,30 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v5 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v5, v5 +; GFX7-NEXT: v_not_b32_e32 v6, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_sub_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX7-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB37_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ 
-12254,31 +12256,31 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v5 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v5, v5 +; GFX6-NEXT: v_not_b32_e32 v6, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_sub_f32_e32 v2, v2, v6 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX6-NEXT: v_mov_b32_e32 v8, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX6-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB37_1 ; 
GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12638,28 +12640,28 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_sub_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB38_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4: @@ -12672,29 +12674,29 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 ; 
GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX6-NEXT: v_sub_f32_e32 v2, v2, v6 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB38_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 @@ -13038,23 +13040,23 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: 
v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v5, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB39_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13071,24 +13073,24 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX6-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: 
v_or_b32_e32 v2, v5, v2 -; GFX6-NEXT: v_mov_b32_e32 v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v5, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB39_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13541,35 +13543,35 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v5 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v7, v4 +; GFX7-NEXT: v_not_b32_e32 v6, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_sub_f32_e32 v2, v2, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v4, v3, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: 
v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v8, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX7-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB40_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -13585,36 +13587,36 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v5 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v7, v4 +; GFX6-NEXT: v_not_b32_e32 v6, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_sub_f32_e32 v2, v2, v8 
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, v3, v7 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX6-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB40_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -14051,30 +14053,30 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v5 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v5, v5 +; GFX7-NEXT: v_not_b32_e32 v6, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX7-NEXT: 
v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_sub_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX7-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v8 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB41_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14093,31 +14095,31 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v5 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v5, v5 +; GFX6-NEXT: v_not_b32_e32 v6, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 
0xffff0000, v2 ; GFX6-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_sub_f32_e32 v2, v2, v6 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX6-NEXT: v_mov_b32_e32 v8, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v4, v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX6-NEXT: v_or_b32_e32 v7, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v8 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB41_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll index a867c6c1affb8..713959fc5e2b4 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll @@ -2167,22 +2167,22 @@ define void @global_atomic_nand_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 
addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: .LBB51_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, v4, v2 -; SI-NEXT: v_not_b32_e32 v3, v3 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v6, v4 ; SI-NEXT: v_mov_b32_e32 v5, v3 -; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, v5, v2 +; SI-NEXT: v_not_b32_e32 v4, v3 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] ; SI-NEXT: s_cbranch_execnz .LBB51_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2245,22 +2245,22 @@ define void @global_atomic_nand_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: .LBB52_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, v4, v2 -; SI-NEXT: v_not_b32_e32 v3, v3 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v6, v4 ; SI-NEXT: v_mov_b32_e32 v5, v3 -; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, v5, v2 +; SI-NEXT: v_not_b32_e32 v4, v3 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; 
SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] ; SI-NEXT: s_cbranch_execnz .LBB52_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2487,38 +2487,38 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_scalar(ptr addrspace(1) inr ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v4, s6, 0 -; SI-NEXT: v_writelane_b32 v4, s7, 1 +; SI-NEXT: v_writelane_b32 v3, s6, 0 +; SI-NEXT: v_writelane_b32 v3, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: .LBB55_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, s34, v1 -; SI-NEXT: v_not_b32_e32 v0, v0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: v_mov_b32_e32 v2, v0 -; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, s34, v2 +; SI-NEXT: v_not_b32_e32 v1, v0 +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB55_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 
exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v4, 1 -; SI-NEXT: v_readlane_b32 s6, v4, 0 +; SI-NEXT: v_readlane_b32 s7, v3, 1 +; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -2578,38 +2578,38 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_offset_scalar(ptr addrspace ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v4, s6, 0 -; SI-NEXT: v_writelane_b32 v4, s7, 1 +; SI-NEXT: v_writelane_b32 v3, s6, 0 +; SI-NEXT: v_writelane_b32 v3, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: .LBB56_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, s34, v1 -; SI-NEXT: v_not_b32_e32 v0, v0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: v_mov_b32_e32 v2, v0 -; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, s34, v2 +; SI-NEXT: v_not_b32_e32 v1, v0 +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SI-NEXT: 
v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB56_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v4, 1 -; SI-NEXT: v_readlane_b32 s6, v4, 0 +; SI-NEXT: v_readlane_b32 s7, v3, 1 +; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -2862,22 +2862,22 @@ define void @global_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory(ptr ad ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: .LBB59_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, v4, v2 -; SI-NEXT: v_not_b32_e32 v3, v3 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v6, v4 ; SI-NEXT: v_mov_b32_e32 v5, v3 -; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, v5, v2 +; SI-NEXT: v_not_b32_e32 v4, v3 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] ; SI-NEXT: s_cbranch_execnz .LBB59_1 ; SI-NEXT: ; %bb.2: ; 
%atomicrmw.end @@ -3950,21 +3950,21 @@ define void @global_atomic_max_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: .LBB83_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_max_i32_e32 v3, v4, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v6, v4 ; SI-NEXT: v_mov_b32_e32 v5, v3 -; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_max_i32_e32 v4, v5, v2 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] ; SI-NEXT: s_cbranch_execnz .LBB83_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4025,21 +4025,21 @@ define void @global_atomic_max_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: .LBB84_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_max_i32_e32 v3, v4, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v6, v4 ; SI-NEXT: v_mov_b32_e32 v5, v3 -; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_max_i32_e32 v4, v5, v2 +; SI-NEXT: v_mov_b32_e32 v3, 
v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] ; SI-NEXT: s_cbranch_execnz .LBB84_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4258,37 +4258,37 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_scalar(ptr addrspace(1) inre ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v4, s6, 0 -; SI-NEXT: v_writelane_b32 v4, s7, 1 +; SI-NEXT: v_writelane_b32 v3, s6, 0 +; SI-NEXT: v_writelane_b32 v3, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: .LBB87_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_max_i32_e32 v0, s34, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: v_mov_b32_e32 v2, v0 -; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_max_i32_e32 v1, s34, v2 +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, 
exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB87_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v4, 1 -; SI-NEXT: v_readlane_b32 s6, v4, 0 +; SI-NEXT: v_readlane_b32 s7, v3, 1 +; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -4346,37 +4346,37 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_offset_scalar(ptr addrspace( ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v4, s6, 0 -; SI-NEXT: v_writelane_b32 v4, s7, 1 +; SI-NEXT: v_writelane_b32 v3, s6, 0 +; SI-NEXT: v_writelane_b32 v3, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: .LBB88_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_max_i32_e32 v0, s34, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: v_mov_b32_e32 v2, v0 -; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_max_i32_e32 v1, s34, v2 +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: 
v_cmp_eq_u32_e32 vcc, v2, v1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB88_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v4, 1 -; SI-NEXT: v_readlane_b32 s6, v4, 0 +; SI-NEXT: v_readlane_b32 s7, v3, 1 +; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -4627,20 +4627,20 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: v_mov_b32_e32 v0, s3 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: .LBB91_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_max_i32_e32 v0, s2, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: v_mov_b32_e32 v2, v0 -; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_max_i32_e32 v1, s2, v2 +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execnz .LBB91_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4726,20 +4726,20 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou ; SI-NEXT: s_mov_b64 
s[0:1], 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: .LBB92_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_max_i32_e32 v0, s8, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: v_mov_b32_e32 v2, v0 -; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_max_i32_e32 v1, s8, v2 +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execnz .LBB92_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4748,7 +4748,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s2 ; SI-NEXT: s_mov_b32 s5, s3 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_max_i32_ret_addr64_offset: @@ -4840,20 +4840,20 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: v_mov_b32_e32 v0, s3 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: .LBB93_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_max_i32_e32 v0, s2, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: v_mov_b32_e32 v2, v0 -; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_max_i32_e32 v1, s2, v2 +; 
SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execnz .LBB93_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4936,20 +4936,20 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: .LBB94_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_max_i32_e32 v0, s8, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: v_mov_b32_e32 v2, v0 -; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_max_i32_e32 v1, s8, v2 +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execnz .LBB94_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4958,7 +4958,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s2 ; SI-NEXT: s_mov_b32 s5, s3 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_max_i32_ret_addr64: @@ -5041,21 +5041,21 @@ define void @global_atomic_max_i32_noret_offset__amdgpu_no_remote_memory(ptr 
add ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: .LBB95_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_max_i32_e32 v3, v4, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v6, v4 ; SI-NEXT: v_mov_b32_e32 v5, v3 -; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_max_i32_e32 v4, v5, v2 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] ; SI-NEXT: s_cbranch_execnz .LBB95_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5203,21 +5203,21 @@ define void @global_atomic_umax_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: .LBB97_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_max_u32_e32 v3, v4, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v6, v4 ; SI-NEXT: v_mov_b32_e32 v5, v3 -; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_max_u32_e32 v4, v5, v2 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 
glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] ; SI-NEXT: s_cbranch_execnz .LBB97_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5278,21 +5278,21 @@ define void @global_atomic_umax_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: .LBB98_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_max_u32_e32 v3, v4, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v6, v4 ; SI-NEXT: v_mov_b32_e32 v5, v3 -; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_max_u32_e32 v4, v5, v2 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] ; SI-NEXT: s_cbranch_execnz .LBB98_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5511,37 +5511,37 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_scalar(ptr addrspace(1) inr ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; 
SI-NEXT: v_writelane_b32 v4, s6, 0 -; SI-NEXT: v_writelane_b32 v4, s7, 1 +; SI-NEXT: v_writelane_b32 v3, s6, 0 +; SI-NEXT: v_writelane_b32 v3, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: .LBB101_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_max_u32_e32 v0, s34, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: v_mov_b32_e32 v2, v0 -; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_max_u32_e32 v1, s34, v2 +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB101_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v4, 1 -; SI-NEXT: v_readlane_b32 s6, v4, 0 +; SI-NEXT: v_readlane_b32 s7, v3, 1 +; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -5599,37 +5599,37 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_offset_scalar(ptr addrspace ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v4, s6, 0 -; SI-NEXT: v_writelane_b32 v4, s7, 1 +; SI-NEXT: v_writelane_b32 v3, s6, 0 +; SI-NEXT: v_writelane_b32 v3, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: .LBB102_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_max_u32_e32 v0, s34, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: v_mov_b32_e32 v2, v0 -; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_max_u32_e32 v1, s34, v2 +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB102_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v4, 1 -; SI-NEXT: v_readlane_b32 s6, v4, 0 +; SI-NEXT: v_readlane_b32 s7, v3, 1 +; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -5880,20 +5880,20 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, ; SI-NEXT: 
s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: v_mov_b32_e32 v0, s3 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: .LBB105_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_max_u32_e32 v0, s2, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: v_mov_b32_e32 v2, v0 -; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_max_u32_e32 v1, s2, v2 +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execnz .LBB105_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5979,20 +5979,20 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: .LBB106_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_max_u32_e32 v0, s8, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: v_mov_b32_e32 v2, v0 -; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_max_u32_e32 v1, s8, v2 +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: 
v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execnz .LBB106_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6001,7 +6001,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s2 ; SI-NEXT: s_mov_b32 s5, s3 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_umax_i32_ret_addr64_offset: @@ -6094,20 +6094,20 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: .LBB107_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_max_u32_e32 v0, s8, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: v_mov_b32_e32 v2, v0 -; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_max_u32_e32 v1, s8, v2 +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execnz .LBB107_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6116,7 +6116,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s2 ; SI-NEXT: s_mov_b32 s5, s3 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_umax_i32_ret_addr64: @@ -6199,21 +6199,21 @@ define void 
@global_atomic_umax_i32_noret_offset__amdgpu_no_remote_memory(ptr ad ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: .LBB108_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_max_u32_e32 v3, v4, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v6, v4 ; SI-NEXT: v_mov_b32_e32 v5, v3 -; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_max_u32_e32 v4, v5, v2 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] ; SI-NEXT: s_cbranch_execnz .LBB108_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6361,21 +6361,21 @@ define void @global_atomic_umin_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: .LBB110_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_min_u32_e32 v3, v4, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v6, v4 ; SI-NEXT: v_mov_b32_e32 v5, v3 -; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_min_u32_e32 v4, v5, v2 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 
+; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] ; SI-NEXT: s_cbranch_execnz .LBB110_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6436,21 +6436,21 @@ define void @global_atomic_umin_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: .LBB111_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_min_u32_e32 v3, v4, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v6, v4 ; SI-NEXT: v_mov_b32_e32 v5, v3 -; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_min_u32_e32 v4, v5, v2 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] ; SI-NEXT: s_cbranch_execnz .LBB111_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6669,37 +6669,37 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_scalar(ptr addrspace(1) inr ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill 
; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v4, s6, 0 -; SI-NEXT: v_writelane_b32 v4, s7, 1 +; SI-NEXT: v_writelane_b32 v3, s6, 0 +; SI-NEXT: v_writelane_b32 v3, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: .LBB114_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_min_u32_e32 v0, s34, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: v_mov_b32_e32 v2, v0 -; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_min_u32_e32 v1, s34, v2 +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB114_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v4, 1 -; SI-NEXT: v_readlane_b32 s6, v4, 0 +; SI-NEXT: v_readlane_b32 s7, v3, 1 +; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -6757,37 +6757,37 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_offset_scalar(ptr addrspace ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: 
buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v4, s6, 0 -; SI-NEXT: v_writelane_b32 v4, s7, 1 +; SI-NEXT: v_writelane_b32 v3, s6, 0 +; SI-NEXT: v_writelane_b32 v3, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: .LBB115_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_min_u32_e32 v0, s34, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: v_mov_b32_e32 v2, v0 -; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_min_u32_e32 v1, s34, v2 +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB115_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v4, 1 -; SI-NEXT: v_readlane_b32 s6, v4, 0 +; SI-NEXT: v_readlane_b32 s7, v3, 1 +; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -7032,21 +7032,21 @@ define void 
@global_atomic_umin_i32_noret_offset__amdgpu_no_remote_memory(ptr ad ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: .LBB118_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_min_u32_e32 v3, v4, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v6, v4 ; SI-NEXT: v_mov_b32_e32 v5, v3 -; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_min_u32_e32 v4, v5, v2 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] ; SI-NEXT: s_cbranch_execnz .LBB118_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7194,21 +7194,21 @@ define void @global_atomic_min_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: .LBB120_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_min_i32_e32 v3, v4, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v6, v4 ; SI-NEXT: v_mov_b32_e32 v5, v3 -; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_min_i32_e32 v4, v5, v2 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 +; 
SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] ; SI-NEXT: s_cbranch_execnz .LBB120_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7269,21 +7269,21 @@ define void @global_atomic_min_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: .LBB121_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_min_i32_e32 v3, v4, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v6, v4 ; SI-NEXT: v_mov_b32_e32 v5, v3 -; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_min_i32_e32 v4, v5, v2 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] ; SI-NEXT: s_cbranch_execnz .LBB121_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7502,37 +7502,37 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_scalar(ptr addrspace(1) inre ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill ; 
SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v4, s6, 0 -; SI-NEXT: v_writelane_b32 v4, s7, 1 +; SI-NEXT: v_writelane_b32 v3, s6, 0 +; SI-NEXT: v_writelane_b32 v3, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: .LBB124_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_min_i32_e32 v0, s34, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: v_mov_b32_e32 v2, v0 -; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_min_i32_e32 v1, s34, v2 +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB124_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v4, 1 -; SI-NEXT: v_readlane_b32 s6, v4, 0 +; SI-NEXT: v_readlane_b32 s7, v3, 1 +; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -7590,37 +7590,37 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_offset_scalar(ptr addrspace( ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: 
buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v4, s6, 0 -; SI-NEXT: v_writelane_b32 v4, s7, 1 +; SI-NEXT: v_writelane_b32 v3, s6, 0 +; SI-NEXT: v_writelane_b32 v3, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: .LBB125_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_min_i32_e32 v0, s34, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: v_mov_b32_e32 v2, v0 -; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_min_i32_e32 v1, s34, v2 +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB125_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v4, 1 -; SI-NEXT: v_readlane_b32 s6, v4, 0 +; SI-NEXT: v_readlane_b32 s7, v3, 1 +; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -7871,20 +7871,20 @@ define amdgpu_kernel void 
@atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: v_mov_b32_e32 v0, s3 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: .LBB128_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_min_i32_e32 v0, s2, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: v_mov_b32_e32 v2, v0 -; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_min_i32_e32 v1, s2, v2 +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execnz .LBB128_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7970,20 +7970,20 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: .LBB129_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_min_i32_e32 v0, s8, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: v_mov_b32_e32 v2, v0 -; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_min_i32_e32 v1, s8, v2 +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, 
v2 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execnz .LBB129_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7992,7 +7992,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s2 ; SI-NEXT: s_mov_b32 s5, s3 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_min_i32_ret_addr64_offset: @@ -8080,20 +8080,20 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s2 +; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: .LBB130_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_min_i32_e32 v0, s6, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: v_mov_b32_e32 v2, v0 -; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_min_i32_e32 v1, s6, v2 +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[4:5] ; SI-NEXT: s_cbranch_execnz .LBB130_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8167,20 +8167,20 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: .LBB131_1: ; %atomicrmw.start ; SI-NEXT: ; =>This 
Inner Loop Header: Depth=1 -; SI-NEXT: v_min_i32_e32 v0, s8, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: v_mov_b32_e32 v2, v0 -; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_min_i32_e32 v1, s8, v2 +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execnz .LBB131_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8189,7 +8189,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s2 ; SI-NEXT: s_mov_b32 s5, s3 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_min_i32_ret_addr64: @@ -8272,21 +8272,21 @@ define void @global_atomic_min_i32_noret_offset__amdgpu_no_remote_memory(ptr add ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: .LBB132_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_min_i32_e32 v3, v4, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v6, v4 ; SI-NEXT: v_mov_b32_e32 v5, v3 -; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_min_i32_e32 v4, v5, v2 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 
glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] ; SI-NEXT: s_cbranch_execnz .LBB132_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll index a7f16449f058e..8c10a2cc7f2f8 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll @@ -2205,27 +2205,27 @@ define void @global_atomic_nand_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: .LBB50_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v4, v7, v3 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v8, v6, v2 -; SI-NEXT: v_not_b32_e32 v5, v4 -; SI-NEXT: v_not_b32_e32 v4, v8 -; SI-NEXT: v_mov_b32_e32 v11, v7 -; SI-NEXT: v_mov_b32_e32 v10, v6 ; SI-NEXT: v_mov_b32_e32 v9, v5 ; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, v9, v3 +; SI-NEXT: v_and_b32_e32 v5, v8, v2 +; SI-NEXT: v_not_b32_e32 v7, v4 +; SI-NEXT: v_not_b32_e32 v6, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9] ; 
SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: v_mov_b32_e32 v6, v8 -; SI-NEXT: v_mov_b32_e32 v7, v9 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] ; SI-NEXT: s_cbranch_execnz .LBB50_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2294,27 +2294,27 @@ define void @global_atomic_nand_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: .LBB51_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v4, v7, v3 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v8, v6, v2 -; SI-NEXT: v_not_b32_e32 v5, v4 -; SI-NEXT: v_not_b32_e32 v4, v8 -; SI-NEXT: v_mov_b32_e32 v11, v7 -; SI-NEXT: v_mov_b32_e32 v10, v6 ; SI-NEXT: v_mov_b32_e32 v9, v5 ; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, v9, v3 +; SI-NEXT: v_and_b32_e32 v5, v8, v2 +; SI-NEXT: v_not_b32_e32 v7, v4 +; SI-NEXT: v_not_b32_e32 v6, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: v_mov_b32_e32 v6, v8 -; SI-NEXT: v_mov_b32_e32 v7, v9 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] ; SI-NEXT: s_cbranch_execnz .LBB51_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2578,44 +2578,44 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_scalar(ptr addrspace(1) inr ; SI: ; %bb.0: ; SI-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v8, s6, 0 -; SI-NEXT: v_writelane_b32 v8, s7, 1 +; SI-NEXT: v_writelane_b32 v6, s6, 0 +; SI-NEXT: v_writelane_b32 v6, s7, 1 ; SI-NEXT: s_mov_b32 s34, s7 ; SI-NEXT: s_mov_b32 s35, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: .LBB54_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, s34, v3 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, s35, v2 -; SI-NEXT: v_not_b32_e32 v1, v0 -; SI-NEXT: v_not_b32_e32 v0, v4 -; SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_mov_b32_e32 v6, v2 ; SI-NEXT: v_mov_b32_e32 v5, v1 ; SI-NEXT: v_mov_b32_e32 v4, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 glc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, s34, v5 +; SI-NEXT: v_and_b32_e32 v1, s35, v4 +; SI-NEXT: v_not_b32_e32 v3, v0 +; SI-NEXT: v_not_b32_e32 v2, v1 +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB54_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; 
SI-NEXT: v_readlane_b32 s7, v8, 1 -; SI-NEXT: v_readlane_b32 s6, v8, 0 +; SI-NEXT: v_readlane_b32 s7, v6, 1 +; SI-NEXT: v_readlane_b32 s6, v6, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -2683,44 +2683,44 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_offset_scalar(ptr addrspace ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v8, s6, 0 -; SI-NEXT: v_writelane_b32 v8, s7, 1 +; SI-NEXT: v_writelane_b32 v6, s6, 0 +; SI-NEXT: v_writelane_b32 v6, s7, 1 ; SI-NEXT: s_mov_b32 s34, s7 ; SI-NEXT: s_mov_b32 s35, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: .LBB55_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, s34, v3 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, s35, v2 -; SI-NEXT: v_not_b32_e32 v1, v0 -; SI-NEXT: v_not_b32_e32 v0, v4 -; SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_mov_b32_e32 v6, v2 ; SI-NEXT: v_mov_b32_e32 v5, v1 ; SI-NEXT: v_mov_b32_e32 v4, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, s34, v5 +; SI-NEXT: v_and_b32_e32 v1, s35, v4 +; SI-NEXT: v_not_b32_e32 v3, v0 +; SI-NEXT: v_not_b32_e32 v2, v1 +; SI-NEXT: 
v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB55_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v8, 1 -; SI-NEXT: v_readlane_b32 s6, v8, 0 +; SI-NEXT: v_readlane_b32 s7, v6, 1 +; SI-NEXT: v_readlane_b32 s6, v6, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3003,27 +3003,27 @@ define void @global_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr ad ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: .LBB58_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v4, v7, v3 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v8, v6, v2 -; SI-NEXT: v_not_b32_e32 v5, v4 -; SI-NEXT: v_not_b32_e32 v4, v8 -; SI-NEXT: v_mov_b32_e32 v11, v7 -; SI-NEXT: v_mov_b32_e32 v10, v6 ; SI-NEXT: v_mov_b32_e32 v9, v5 ; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_and_b32_e32 v4, v9, v3 +; SI-NEXT: v_and_b32_e32 v5, v8, v2 +; SI-NEXT: v_not_b32_e32 v7, v4 +; SI-NEXT: v_not_b32_e32 v6, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: v_mov_b32_e32 v6, v8 -; SI-NEXT: v_mov_b32_e32 v7, v9 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] ; SI-NEXT: s_cbranch_execnz .LBB58_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4073,26 +4073,26 @@ define void @global_atomic_max_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: .LBB80_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v11, v7 -; SI-NEXT: v_mov_b32_e32 v10, v6 ; SI-NEXT: v_mov_b32_e32 v9, v5 ; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: v_mov_b32_e32 v6, v8 -; SI-NEXT: v_mov_b32_e32 v7, v9 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] ; SI-NEXT: s_cbranch_execnz .LBB80_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4159,26 +4159,26 @@ define void @global_atomic_max_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: .LBB81_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v11, v7 -; SI-NEXT: v_mov_b32_e32 v10, v6 ; SI-NEXT: v_mov_b32_e32 v9, v5 ; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: v_mov_b32_e32 v6, v8 -; SI-NEXT: v_mov_b32_e32 v7, v9 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] ; SI-NEXT: s_cbranch_execnz .LBB81_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4443,28 +4443,28 @@ define 
amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inre ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB84_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v3 -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: v_mov_b32_e32 v6, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v2, v6 -; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB84_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4551,28 +4551,28 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_offset_scalar(ptr addrspace( ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 
s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB85_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v3 -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: v_mov_b32_e32 v6, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v2, v6 -; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB85_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4878,26 +4878,26 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; SI-NEXT: v_mov_b32_e32 v4, s3 ; SI-NEXT: v_mov_b32_e32 v5, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: .LBB88_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; SI-NEXT: v_mov_b32_e32 v9, v1 +; 
SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v3 -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: v_mov_b32_e32 v6, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_mov_b32_e32 v2, v6 -; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execnz .LBB88_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4989,29 +4989,29 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: v_mov_b32_e32 v8, s5 -; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: v_mov_b32_e32 v4, s5 +; SI-NEXT: v_mov_b32_e32 v5, s4 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: .LBB89_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; 
SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_mov_b32_e32 v6, v2 -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_mov_b32_e32 v4, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[8:11], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execnz .LBB89_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5020,7 +5020,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s2 ; SI-NEXT: s_mov_b32 s5, s3 -; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_max_i64_ret_addr64_offset: @@ -5119,26 +5119,26 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; SI-NEXT: v_mov_b32_e32 v4, s3 ; SI-NEXT: v_mov_b32_e32 v5, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: .LBB90_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
v_mov_b32_e32 v9, v3 -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: v_mov_b32_e32 v6, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_mov_b32_e32 v2, v6 -; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execnz .LBB90_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5227,29 +5227,29 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: v_mov_b32_e32 v8, s5 -; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: v_mov_b32_e32 v4, s5 +; SI-NEXT: v_mov_b32_e32 v5, s4 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: .LBB91_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_mov_b32_e32 v6, v2 -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_mov_b32_e32 v4, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[8:11], 0 glc +; SI-NEXT: v_mov_b32_e32 v0, v6 +; 
SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execnz .LBB91_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5258,7 +5258,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s2 ; SI-NEXT: s_mov_b32 s5, s3 -; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_max_i64_ret_addr64: @@ -5347,26 +5347,26 @@ define void @global_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr add ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: .LBB92_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v11, v7 -; SI-NEXT: v_mov_b32_e32 v10, v6 ; SI-NEXT: v_mov_b32_e32 v9, v5 ; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: 
v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: v_mov_b32_e32 v6, v8 -; SI-NEXT: v_mov_b32_e32 v7, v9 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] ; SI-NEXT: s_cbranch_execnz .LBB92_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5535,26 +5535,26 @@ define void @global_atomic_umax_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: .LBB94_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v11, v7 -; SI-NEXT: v_mov_b32_e32 v10, v6 ; SI-NEXT: v_mov_b32_e32 v9, v5 ; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: v_mov_b32_e32 v6, v8 
-; SI-NEXT: v_mov_b32_e32 v7, v9 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] ; SI-NEXT: s_cbranch_execnz .LBB94_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5621,26 +5621,26 @@ define void @global_atomic_umax_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: .LBB95_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v11, v7 -; SI-NEXT: v_mov_b32_e32 v10, v6 ; SI-NEXT: v_mov_b32_e32 v9, v5 ; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: v_mov_b32_e32 v6, v8 -; SI-NEXT: v_mov_b32_e32 v7, v9 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] ; SI-NEXT: s_cbranch_execnz .LBB95_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5905,28 +5905,28 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inr ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; 
SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB98_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v3 -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: v_mov_b32_e32 v6, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v2, v6 -; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB98_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6013,28 +6013,28 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_offset_scalar(ptr addrspace ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB99_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v3 -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: v_mov_b32_e32 v6, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v2, v6 -; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB99_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6340,26 +6340,26 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; SI-NEXT: v_mov_b32_e32 v4, s3 ; SI-NEXT: v_mov_b32_e32 v5, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: .LBB102_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, 
vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v3 -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: v_mov_b32_e32 v6, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_mov_b32_e32 v2, v6 -; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execnz .LBB102_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6451,29 +6451,29 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: v_mov_b32_e32 v8, s5 -; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: v_mov_b32_e32 v4, s5 +; SI-NEXT: v_mov_b32_e32 v5, s4 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: .LBB103_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_mov_b32_e32 v6, v2 -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_mov_b32_e32 v4, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 
v[4:7], off, s[8:11], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execnz .LBB103_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6482,7 +6482,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s2 ; SI-NEXT: s_mov_b32 s5, s3 -; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_umax_i64_ret_addr64_offset: @@ -6577,29 +6577,29 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: v_mov_b32_e32 v8, s5 -; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: v_mov_b32_e32 v4, s5 +; SI-NEXT: v_mov_b32_e32 v5, s4 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: .LBB104_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; 
SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_mov_b32_e32 v6, v2 -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_mov_b32_e32 v4, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[8:11], 0 glc +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execnz .LBB104_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6608,7 +6608,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s2 ; SI-NEXT: s_mov_b32 s5, s3 -; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_umax_i64_ret_addr64: @@ -6697,26 +6697,26 @@ define void @global_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr ad ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: .LBB105_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v11, v7 -; SI-NEXT: v_mov_b32_e32 v10, v6 ; SI-NEXT: v_mov_b32_e32 v9, v5 ; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 
offset:32 glc +; SI-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: v_mov_b32_e32 v6, v8 -; SI-NEXT: v_mov_b32_e32 v7, v9 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] ; SI-NEXT: s_cbranch_execnz .LBB105_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6885,26 +6885,26 @@ define void @global_atomic_umin_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: .LBB107_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v11, v7 -; SI-NEXT: v_mov_b32_e32 v10, v6 ; SI-NEXT: v_mov_b32_e32 v9, v5 ; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 
addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: v_mov_b32_e32 v6, v8 -; SI-NEXT: v_mov_b32_e32 v7, v9 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] ; SI-NEXT: s_cbranch_execnz .LBB107_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6971,26 +6971,26 @@ define void @global_atomic_umin_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: .LBB108_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v11, v7 -; SI-NEXT: v_mov_b32_e32 v10, v6 ; SI-NEXT: v_mov_b32_e32 v9, v5 ; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: v_mov_b32_e32 v6, v8 -; SI-NEXT: v_mov_b32_e32 v7, v9 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] ; SI-NEXT: s_cbranch_execnz .LBB108_1 
; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7255,28 +7255,28 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inr ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB111_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v3 -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: v_mov_b32_e32 v6, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v2, v6 -; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB111_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7363,28 +7363,28 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_offset_scalar(ptr addrspace ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 +; SI-NEXT: 
buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB112_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v3 -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: v_mov_b32_e32 v6, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v2, v6 -; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB112_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7683,26 +7683,26 @@ define void @global_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr ad ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: .LBB115_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] -; 
SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v11, v7 -; SI-NEXT: v_mov_b32_e32 v10, v6 ; SI-NEXT: v_mov_b32_e32 v9, v5 ; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: v_mov_b32_e32 v6, v8 -; SI-NEXT: v_mov_b32_e32 v7, v9 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] ; SI-NEXT: s_cbranch_execnz .LBB115_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7871,26 +7871,26 @@ define void @global_atomic_min_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: .LBB117_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v11, v7 -; SI-NEXT: v_mov_b32_e32 v10, v6 ; SI-NEXT: v_mov_b32_e32 v9, v5 ; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3] 
+; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: v_mov_b32_e32 v6, v8 -; SI-NEXT: v_mov_b32_e32 v7, v9 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] ; SI-NEXT: s_cbranch_execnz .LBB117_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7957,26 +7957,26 @@ define void @global_atomic_min_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: .LBB118_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v11, v7 -; SI-NEXT: v_mov_b32_e32 v10, v6 ; SI-NEXT: v_mov_b32_e32 v9, v5 ; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt 
vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: v_mov_b32_e32 v6, v8 -; SI-NEXT: v_mov_b32_e32 v7, v9 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] ; SI-NEXT: s_cbranch_execnz .LBB118_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8241,28 +8241,28 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inre ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB121_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v3 -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: v_mov_b32_e32 v6, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v2, v6 -; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB121_1 ; 
SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8349,28 +8349,28 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_offset_scalar(ptr addrspace( ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB122_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v3 -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: v_mov_b32_e32 v6, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v2, v6 -; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB122_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8676,26 +8676,26 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; SI-NEXT: v_mov_b32_e32 v4, s3 ; SI-NEXT: v_mov_b32_e32 v5, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: 
v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: .LBB125_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v3 -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: v_mov_b32_e32 v6, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_mov_b32_e32 v2, v6 -; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execnz .LBB125_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8787,29 +8787,29 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: v_mov_b32_e32 v8, s5 -; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: v_mov_b32_e32 v4, s5 +; SI-NEXT: v_mov_b32_e32 v5, s4 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: .LBB126_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 
-; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_mov_b32_e32 v6, v2 -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_mov_b32_e32 v4, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[8:11], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execnz .LBB126_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8818,7 +8818,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s2 ; SI-NEXT: s_mov_b32 s5, s3 -; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_min_i64_ret_addr64_offset: @@ -8913,28 +8913,28 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; SI-NEXT: v_mov_b32_e32 v4, s3 ; SI-NEXT: v_mov_b32_e32 v5, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s4 -; SI-NEXT: v_mov_b32_e32 v3, s5 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: .LBB127_1: ; %atomicrmw.start ; 
SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v3 -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: v_mov_b32_e32 v6, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: v_mov_b32_e32 v2, v6 -; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] ; SI-NEXT: s_cbranch_execnz .LBB127_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9014,29 +9014,29 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: v_mov_b32_e32 v8, s5 -; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: v_mov_b32_e32 v4, s5 +; SI-NEXT: v_mov_b32_e32 v5, s4 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: .LBB128_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc +; SI-NEXT: v_mov_b32_e32 v9, v1 +; 
SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_mov_b32_e32 v6, v2 -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_mov_b32_e32 v4, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[8:11], 0 glc +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execnz .LBB128_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9045,7 +9045,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s2 ; SI-NEXT: s_mov_b32 s5, s3 -; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_min_i64_ret_addr64: @@ -9134,26 +9134,26 @@ define void @global_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr add ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: .LBB129_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; SI-NEXT: s_waitcnt expcnt(0) -; 
SI-NEXT: v_mov_b32_e32 v11, v7 -; SI-NEXT: v_mov_b32_e32 v10, v6 ; SI-NEXT: v_mov_b32_e32 v9, v5 ; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: v_mov_b32_e32 v6, v8 -; SI-NEXT: v_mov_b32_e32 v7, v9 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] ; SI-NEXT: s_cbranch_execnz .LBB129_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll index 37756d15861be..043619bf00b5f 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll @@ -34,19 +34,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX7LESS-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 ; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0 +; GFX7LESS-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 
0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB0_2 ; GFX7LESS-NEXT: .LBB0_3: @@ -205,19 +205,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX7LESS-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 ; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s6 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-DPP-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX7LESS-DPP-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB0_2 ; GFX7LESS-DPP-NEXT: .LBB0_3: @@ -416,20 +416,20 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: .LBB1_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0 +; GFX7LESS-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_4 ; GFX7LESS-NEXT: .LBB1_5: @@ -762,20 +762,20 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX7LESS-DPP-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f32_e32 v2, v3, v0 +; GFX7LESS-DPP-NEXT: 
v_mov_b32_e32 v1, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[1:2], off, s[36:39], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB1_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1163,19 +1163,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX7LESS-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0 +; GFX7LESS-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB2_2 ; GFX7LESS-NEXT: .LBB2_3: @@ -1383,19 +1383,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX7LESS-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s2 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX7LESS-DPP-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB2_2 ; GFX7LESS-DPP-NEXT: .LBB2_3: @@ -1634,20 +1634,20 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: .LBB3_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0 +; GFX7LESS-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4 +; GFX7LESS-NEXT: 
buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_4 ; GFX7LESS-NEXT: .LBB3_5: @@ -1980,20 +1980,20 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX7LESS-DPP-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f32_e32 v2, v3, v0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[1:2], off, s[36:39], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB3_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2381,19 +2381,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX7LESS-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX7LESS-NEXT: v_mul_f32_e32 v2, 
4.0, v0 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0 +; GFX7LESS-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB4_2 ; GFX7LESS-NEXT: .LBB4_3: @@ -2631,19 +2631,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX7LESS-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s2 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX7LESS-DPP-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 
0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB4_2 ; GFX7LESS-DPP-NEXT: .LBB4_3: @@ -2912,20 +2912,20 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: .LBB5_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0 +; GFX7LESS-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_4 ; GFX7LESS-NEXT: .LBB5_5: @@ -3258,20 +3258,20 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword 
v1, off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX7LESS-DPP-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f32_e32 v2, v3, v0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[1:2], off, s[36:39], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB5_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3686,20 +3686,20 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: .LBB6_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0 +; GFX7LESS-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4 +; GFX7LESS-NEXT: 
buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_4 ; GFX7LESS-NEXT: .LBB6_5: @@ -4032,20 +4032,20 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX7LESS-DPP-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f32_e32 v2, v3, v0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[1:2], off, s[36:39], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB6_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4433,19 +4433,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX7LESS-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX7LESS-NEXT: v_mul_f32_e32 v2, 
4.0, v0 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0 +; GFX7LESS-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_2 ; GFX7LESS-NEXT: .LBB7_3: @@ -4683,19 +4683,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX7LESS-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s2 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX7LESS-DPP-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 
0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB7_2 ; GFX7LESS-DPP-NEXT: .LBB7_3: @@ -4963,20 +4963,20 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: .LBB8_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0 +; GFX7LESS-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB8_4 ; GFX7LESS-NEXT: .LBB8_5: @@ -5335,20 +5335,20 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword 
v1, off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX7LESS-DPP-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f32_e32 v2, v3, v0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[1:2], off, s[36:39], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB8_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5752,23 +5752,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX7LESS-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s6 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s7 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s7 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v0 +; GFX7LESS-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 -; GFX7LESS-NEXT: 
buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v7 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v8 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v9 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_2 ; GFX7LESS-NEXT: .LBB9_3: @@ -5966,23 +5966,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX7LESS-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s6 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s7 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s7 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v0 +; GFX7LESS-DPP-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v7 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v9 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; 
GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX7LESS-DPP-NEXT: .LBB9_3: @@ -6222,23 +6222,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 +; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v0 +; GFX7LESS-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v7 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v8 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v9 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_4 ; GFX7LESS-NEXT: .LBB10_5: @@ -6612,23 +6612,23 @@ define amdgpu_kernel void 
@global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX7LESS-DPP-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc -; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] -; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f64 v[4:5], v[6:7], v[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v4 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v5 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[2:5], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[6:7] +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB10_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7108,23 +7108,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX7LESS-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] ; GFX7LESS-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s8 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s9 +; GFX7LESS-NEXT: 
v_mov_b32_e32 v0, s8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s9 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v0 +; GFX7LESS-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v7 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v8 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v9 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_2 ; GFX7LESS-NEXT: .LBB11_3: @@ -7365,23 +7365,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] ; GFX7LESS-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s8 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s9 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v0 +; GFX7LESS-DPP-NEXT: v_add_f64 v[6:7], 
v[8:9], v[4:5] ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v7 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v9 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB11_2 ; GFX7LESS-DPP-NEXT: .LBB11_3: @@ -7655,23 +7655,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 +; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: .LBB12_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v0 +; GFX7LESS-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 
v7 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v8 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v9 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB12_4 ; GFX7LESS-NEXT: .LBB12_5: @@ -8045,23 +8045,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX7LESS-DPP-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc -; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] -; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f64 v[4:5], v[6:7], v[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v4 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v5 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[2:5], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; 
GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[6:7] +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB12_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8541,23 +8541,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] ; GFX7LESS-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s8 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s9 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s9 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v0 +; GFX7LESS-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v7 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v8 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v9 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB13_2 ; GFX7LESS-NEXT: .LBB13_3: @@ -8798,23 +8798,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] ; 
GFX7LESS-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s8 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s9 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-DPP-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v0 +; GFX7LESS-DPP-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v7 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v9 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB13_2 ; GFX7LESS-DPP-NEXT: .LBB13_3: @@ -9088,23 +9088,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 +; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: .LBB14_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v0 +; GFX7LESS-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v7 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v8 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v9 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB14_4 ; GFX7LESS-NEXT: .LBB14_5: @@ -9478,23 +9478,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX7LESS-DPP-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX7LESS-DPP-NEXT: 
buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc -; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] -; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f64 v[4:5], v[6:7], v[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v4 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v5 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[2:5], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[6:7] +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB14_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10003,23 +10003,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 +; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: .LBB15_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v0 +; GFX7LESS-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v7 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v8 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v9 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 
v[0:3], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB15_4 ; GFX7LESS-NEXT: .LBB15_5: @@ -10393,23 +10393,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX7LESS-DPP-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc -; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] -; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f64 v[4:5], v[6:7], v[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v4 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v5 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[2:5], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[6:7] +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX7LESS-DPP-NEXT: 
s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB15_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10889,23 +10889,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] ; GFX7LESS-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s8 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s9 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s9 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v0 +; GFX7LESS-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v7 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v8 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v9 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB16_2 ; GFX7LESS-NEXT: .LBB16_3: @@ -11146,23 +11146,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] ; GFX7LESS-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: 
v_mov_b32_e32 v2, s8 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s9 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-DPP-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v0 +; GFX7LESS-DPP-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v7 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v9 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX7LESS-DPP-NEXT: .LBB16_3: @@ -11436,23 +11436,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 +; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f64 v[0:1], 
v[2:3], v[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v0 +; GFX7LESS-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v7 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v8 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v9 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB17_4 ; GFX7LESS-NEXT: .LBB17_5: @@ -11826,23 +11826,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX7LESS-DPP-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc -; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: 
v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] -; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f64 v[4:5], v[6:7], v[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v4 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v5 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[2:5], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[6:7] +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12313,19 +12313,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 ; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0 +; GFX7LESS-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB18_2 ; GFX7LESS-NEXT: .LBB18_3: @@ -12484,19 +12484,19 @@ define 
amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX7LESS-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 ; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s6 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-DPP-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX7LESS-DPP-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB18_2 ; GFX7LESS-DPP-NEXT: .LBB18_3: @@ -12659,19 +12659,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 ; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB19_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0 +; GFX7LESS-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 -; 
GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB19_2 ; GFX7LESS-NEXT: .LBB19_3: @@ -12830,19 +12830,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX7LESS-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 ; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s6 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-DPP-NEXT: .LBB19_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX7LESS-DPP-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB19_2 ; GFX7LESS-DPP-NEXT: .LBB19_3: diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll 
index 6351bb39e97f5..d1a43ad347319 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -30,20 +30,20 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; GFX7LESS-NEXT: v_max_f32_e32 v1, 4.0, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB0_2 ; GFX7LESS-NEXT: .LBB0_3: @@ -155,20 +155,20 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s2 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-DPP-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-DPP-NEXT: v_max_f32_e32 
v0, 4.0, v0 -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; GFX7LESS-DPP-NEXT: v_max_f32_e32 v1, 4.0, v0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB0_2 ; GFX7LESS-DPP-NEXT: .LBB0_3: @@ -301,18 +301,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX7LESS-NEXT: .LBB1_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 ; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX7LESS-NEXT: v_mul_f32_e64 v2, 1.0, s4 ; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 ; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] -; GFX7LESS-NEXT: v_max_f32_e32 v2, v1, v2 +; GFX7LESS-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX7LESS-NEXT: s_cbranch_vccnz .LBB1_1 ; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -326,22 +326,22 @@ define amdgpu_kernel void 
@global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v1 ; GFX7LESS-NEXT: .LBB1_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX7LESS-NEXT: v_max_f32_e32 v3, v0, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_4 ; GFX7LESS-NEXT: .LBB1_5: @@ -666,20 +666,20 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7LESS-DPP-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, v0, v2 
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; GFX7LESS-DPP-NEXT: v_max_f32_e32 v2, v1, v0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[1:2], off, s[36:39], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB1_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1068,20 +1068,20 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; GFX7LESS-NEXT: v_max_f32_e32 v1, 4.0, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX7LESS-NEXT: 
s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB2_2 ; GFX7LESS-NEXT: .LBB2_3: @@ -1193,20 +1193,20 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s2 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; GFX7LESS-DPP-NEXT: v_max_f32_e32 v1, 4.0, v0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB2_2 ; GFX7LESS-DPP-NEXT: .LBB2_3: @@ -1340,18 +1340,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX7LESS-NEXT: .LBB3_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 ; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX7LESS-NEXT: v_mul_f32_e64 v2, 1.0, s4 ; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 ; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] -; GFX7LESS-NEXT: v_max_f32_e32 v2, v1, v2 +; GFX7LESS-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX7LESS-NEXT: s_cbranch_vccnz .LBB3_1 ; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -1365,22 +1365,22 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v1 ; GFX7LESS-NEXT: .LBB3_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX7LESS-NEXT: v_max_f32_e32 v3, v0, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 ; 
GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_4 ; GFX7LESS-NEXT: .LBB3_5: @@ -1705,20 +1705,20 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7LESS-DPP-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; GFX7LESS-DPP-NEXT: v_max_f32_e32 v2, v1, v0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[1:2], off, s[36:39], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB3_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2108,20 +2108,20 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop 
Header: Depth=1 -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; GFX7LESS-NEXT: v_max_f32_e32 v1, 4.0, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB4_2 ; GFX7LESS-NEXT: .LBB4_3: @@ -2233,20 +2233,20 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s2 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; GFX7LESS-DPP-NEXT: v_max_f32_e32 v1, 4.0, v0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: 
v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB4_2 ; GFX7LESS-DPP-NEXT: .LBB4_3: @@ -2379,18 +2379,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX7LESS-NEXT: .LBB5_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 ; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX7LESS-NEXT: v_mul_f32_e64 v2, 1.0, s4 ; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 ; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] -; GFX7LESS-NEXT: v_max_f32_e32 v2, v1, v2 +; GFX7LESS-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX7LESS-NEXT: s_cbranch_vccnz .LBB5_1 ; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -2404,22 +2404,22 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v1 ; GFX7LESS-NEXT: .LBB5_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: 
v_max_f32_e32 v0, v0, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX7LESS-NEXT: v_max_f32_e32 v3, v0, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_4 ; GFX7LESS-NEXT: .LBB5_5: @@ -2744,20 +2744,20 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7LESS-DPP-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; GFX7LESS-DPP-NEXT: v_max_f32_e32 v2, v1, v0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[1:2], off, s[36:39], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; 
GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB5_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3146,24 +3146,24 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s6 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s7 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s7 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 ; GFX7LESS-NEXT: v_mov_b32_e32 v5, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] -; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], 4.0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v4 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v5 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_2 ; GFX7LESS-NEXT: .LBB6_3: @@ -3312,24 +3312,24 @@ define amdgpu_kernel void 
@global_atomic_fmax_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s6 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s7 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s7 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-DPP-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc -; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] -; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[0:1], 4.0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v4 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v5 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX7LESS-DPP-NEXT: .LBB6_3: @@ -3499,12 +3499,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 -; GFX7LESS-NEXT: 
v_mov_b32_e32 v5, 0x7ff80000 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX7LESS-NEXT: .LBB7_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX7LESS-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 ; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 ; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 @@ -3512,7 +3512,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] ; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 ; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] -; GFX7LESS-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX7LESS-NEXT: s_cbranch_vccnz .LBB7_1 ; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -3526,25 +3526,25 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 +; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX7LESS-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX7LESS-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 
v[6:9], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX7LESS-NEXT: v_max_f64 v[6:7], v[0:1], v[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v7 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v8 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v9 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_4 ; GFX7LESS-NEXT: .LBB7_5: @@ -3913,23 +3913,23 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX7LESS-DPP-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[6:7], v[6:7] +; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[2:3], v[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v4 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v5 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 +; 
GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[2:5], off, s[36:39], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[6:7] ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB7_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4414,24 +4414,24 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s6 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s7 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s7 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 ; GFX7LESS-NEXT: v_mov_b32_e32 v5, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] -; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], 4.0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v4 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v5 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, 
v[0:1], v[4:5] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB8_2 ; GFX7LESS-NEXT: .LBB8_3: @@ -4580,24 +4580,24 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s6 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s7 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s7 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-DPP-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc -; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] -; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[0:1], 4.0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v4 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v5 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB8_2 ; GFX7LESS-DPP-NEXT: .LBB8_3: @@ -4767,12 +4767,12 @@ define amdgpu_kernel void 
@global_atomic_fmax_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX7LESS-NEXT: .LBB9_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX7LESS-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 ; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 ; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 @@ -4780,7 +4780,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] ; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 ; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] -; GFX7LESS-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX7LESS-NEXT: s_cbranch_vccnz .LBB9_1 ; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -4794,25 +4794,25 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 +; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX7LESS-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX7LESS-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v0 ; 
GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX7LESS-NEXT: v_max_f64 v[6:7], v[0:1], v[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v7 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v8 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v9 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_4 ; GFX7LESS-NEXT: .LBB9_5: @@ -5181,23 +5181,23 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX7LESS-DPP-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; 
GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[6:7], v[6:7] +; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[2:3], v[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v4 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v5 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[2:5], off, s[36:39], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[6:7] ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB9_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5682,24 +5682,24 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s6 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s7 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s7 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 ; GFX7LESS-NEXT: v_mov_b32_e32 v5, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] -; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], 4.0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v2 +; GFX7LESS-NEXT: 
v_mov_b32_e32 v1, v3 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v4 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v5 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_2 ; GFX7LESS-NEXT: .LBB10_3: @@ -5848,24 +5848,24 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s6 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s7 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s7 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc -; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] -; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[0:1], 4.0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v4 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v5 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, 
v[0:1], v[4:5] +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX7LESS-DPP-NEXT: .LBB10_3: @@ -6035,12 +6035,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX7LESS-NEXT: .LBB11_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX7LESS-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 ; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 ; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 @@ -6048,7 +6048,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] ; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 ; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] -; GFX7LESS-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX7LESS-NEXT: s_cbranch_vccnz .LBB11_1 ; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -6062,25 +6062,25 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 +; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX7LESS-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX7LESS-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX7LESS-NEXT: v_max_f64 v[6:7], v[0:1], v[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v7 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v8 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v9 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_4 ; GFX7LESS-NEXT: .LBB11_5: @@ -6449,23 +6449,23 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX7LESS-DPP-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; 
GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[6:7], v[6:7] +; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[2:3], v[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v4 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v5 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[2:5], off, s[36:39], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[6:7] ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB11_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6950,20 +6950,20 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; GFX7LESS-NEXT: v_max_f32_e32 v1, 4.0, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, 
s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB12_2 ; GFX7LESS-NEXT: .LBB12_3: @@ -7075,20 +7075,20 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s2 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-DPP-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; GFX7LESS-DPP-NEXT: v_max_f32_e32 v1, 4.0, v0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB12_2 ; GFX7LESS-DPP-NEXT: .LBB12_3: @@ -7204,20 +7204,20 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; GFX7LESS-NEXT: v_max_f32_e32 v1, 4.0, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB13_2 ; GFX7LESS-NEXT: .LBB13_3: @@ -7329,20 +7329,20 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s2 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-DPP-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; GFX7LESS-DPP-NEXT: v_max_f32_e32 v1, 4.0, v0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 
v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB13_2 ; GFX7LESS-DPP-NEXT: .LBB13_3: diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll index a9ac00863cd17..1801da8a7b775 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll @@ -30,20 +30,20 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; GFX7LESS-NEXT: v_min_f32_e32 v1, 4.0, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; 
GFX7LESS-NEXT: s_cbranch_execnz .LBB0_2 ; GFX7LESS-NEXT: .LBB0_3: @@ -155,20 +155,20 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s2 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-DPP-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; GFX7LESS-DPP-NEXT: v_min_f32_e32 v1, 4.0, v0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB0_2 ; GFX7LESS-DPP-NEXT: .LBB0_3: @@ -301,18 +301,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX7LESS-NEXT: .LBB1_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX7LESS-NEXT: 
v_mul_f32_e32 v1, 1.0, v1 ; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 ; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX7LESS-NEXT: v_mul_f32_e64 v2, 1.0, s4 ; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 ; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] -; GFX7LESS-NEXT: v_min_f32_e32 v2, v1, v2 +; GFX7LESS-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX7LESS-NEXT: s_cbranch_vccnz .LBB1_1 ; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -326,22 +326,22 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v1 ; GFX7LESS-NEXT: .LBB1_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX7LESS-NEXT: v_min_f32_e32 v3, v0, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_4 ; GFX7LESS-NEXT: .LBB1_5: @@ -666,20 +666,20 @@ 
define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7LESS-DPP-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; GFX7LESS-DPP-NEXT: v_min_f32_e32 v2, v1, v0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[1:2], off, s[36:39], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB1_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1068,20 +1068,20 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX7LESS-NEXT: s_waitcnt 
expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; GFX7LESS-NEXT: v_min_f32_e32 v1, 4.0, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB2_2 ; GFX7LESS-NEXT: .LBB2_3: @@ -1193,20 +1193,20 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s2 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; GFX7LESS-DPP-NEXT: v_min_f32_e32 v1, 4.0, v0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; 
GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB2_2 ; GFX7LESS-DPP-NEXT: .LBB2_3: @@ -1340,18 +1340,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX7LESS-NEXT: .LBB3_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 ; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX7LESS-NEXT: v_mul_f32_e64 v2, 1.0, s4 ; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 ; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] -; GFX7LESS-NEXT: v_min_f32_e32 v2, v1, v2 +; GFX7LESS-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX7LESS-NEXT: s_cbranch_vccnz .LBB3_1 ; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -1365,22 +1365,22 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v1 ; GFX7LESS-NEXT: .LBB3_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, 
v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX7LESS-NEXT: v_min_f32_e32 v3, v0, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_4 ; GFX7LESS-NEXT: .LBB3_5: @@ -1705,20 +1705,20 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7LESS-DPP-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; GFX7LESS-DPP-NEXT: v_min_f32_e32 v2, v1, v0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[1:2], off, s[36:39], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB3_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2108,20 +2108,20 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; GFX7LESS-NEXT: v_min_f32_e32 v1, 4.0, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB4_2 ; GFX7LESS-NEXT: .LBB4_3: @@ -2233,20 +2233,20 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s2 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, 4.0, 
v0 -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; GFX7LESS-DPP-NEXT: v_min_f32_e32 v1, 4.0, v0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB4_2 ; GFX7LESS-DPP-NEXT: .LBB4_3: @@ -2379,18 +2379,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX7LESS-NEXT: .LBB5_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2 ; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX7LESS-NEXT: v_mul_f32_e64 v2, 1.0, s4 ; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 ; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] -; GFX7LESS-NEXT: v_min_f32_e32 v2, v1, v2 +; GFX7LESS-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX7LESS-NEXT: s_cbranch_vccnz .LBB5_1 ; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -2404,22 +2404,22 @@ define amdgpu_kernel void 
@global_atomic_fmin_uni_address_div_value_default_scop ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v1 ; GFX7LESS-NEXT: .LBB5_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX7LESS-NEXT: v_min_f32_e32 v3, v0, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_4 ; GFX7LESS-NEXT: .LBB5_5: @@ -2744,20 +2744,20 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7LESS-DPP-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, v0, 
v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; GFX7LESS-DPP-NEXT: v_min_f32_e32 v2, v1, v0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[1:2], off, s[36:39], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB5_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3146,24 +3146,24 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s6 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s7 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s7 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 ; GFX7LESS-NEXT: v_mov_b32_e32 v5, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] -; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX7LESS-NEXT: 
v_min_f64 v[2:3], v[0:1], 4.0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v4 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v5 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_2 ; GFX7LESS-NEXT: .LBB6_3: @@ -3312,24 +3312,24 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s6 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s7 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s7 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-DPP-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc -; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] -; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX7LESS-DPP-NEXT: v_min_f64 v[2:3], v[0:1], 4.0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v4 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v5 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc +; 
GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX7LESS-DPP-NEXT: .LBB6_3: @@ -3499,12 +3499,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX7LESS-NEXT: .LBB7_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX7LESS-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 ; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 ; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 @@ -3512,7 +3512,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] ; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 ; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] -; GFX7LESS-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] +; GFX7LESS-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX7LESS-NEXT: s_cbranch_vccnz .LBB7_1 ; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -3526,25 +3526,25 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 +; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX7LESS-NEXT: v_max_f64 v[4:5], 
v[2:3], v[2:3] ; GFX7LESS-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX7LESS-NEXT: v_min_f64 v[6:7], v[0:1], v[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v7 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v8 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v9 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_4 ; GFX7LESS-NEXT: .LBB7_5: @@ -3913,23 +3913,23 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX7LESS-DPP-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 +; 
GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[6:7], v[6:7] +; GFX7LESS-DPP-NEXT: v_min_f64 v[4:5], v[2:3], v[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v4 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v5 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[2:5], off, s[36:39], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[6:7] ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB7_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4414,24 +4414,24 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s6 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s7 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s7 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 ; GFX7LESS-NEXT: v_mov_b32_e32 v5, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc -; 
GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] -; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX7LESS-NEXT: v_min_f64 v[2:3], v[0:1], 4.0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v4 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v5 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB8_2 ; GFX7LESS-NEXT: .LBB8_3: @@ -4580,24 +4580,24 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s6 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s7 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s7 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-DPP-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc -; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] -; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX7LESS-DPP-NEXT: v_min_f64 v[2:3], v[0:1], 4.0 +; 
GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v4 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v5 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB8_2 ; GFX7LESS-DPP-NEXT: .LBB8_3: @@ -4767,12 +4767,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX7LESS-NEXT: .LBB9_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX7LESS-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 ; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 ; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 @@ -4780,7 +4780,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] ; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 ; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] -; GFX7LESS-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] +; GFX7LESS-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX7LESS-NEXT: s_cbranch_vccnz .LBB9_1 ; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -4794,25 +4794,25 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 +; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX7LESS-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX7LESS-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX7LESS-NEXT: v_min_f64 v[6:7], v[0:1], v[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v7 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v8 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v9 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_4 ; GFX7LESS-NEXT: .LBB9_5: @@ -5181,23 +5181,23 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX7LESS-DPP-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[6:7], v[6:7] +; GFX7LESS-DPP-NEXT: v_min_f64 v[4:5], v[2:3], v[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v4 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v5 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[2:5], off, s[36:39], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[6:7] ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB9_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5682,24 +5682,24 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s6 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s7 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s7 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) 
-; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 ; GFX7LESS-NEXT: v_mov_b32_e32 v5, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] -; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX7LESS-NEXT: v_min_f64 v[2:3], v[0:1], 4.0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v4 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v5 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_2 ; GFX7LESS-NEXT: .LBB10_3: @@ -5848,24 +5848,24 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s6 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s7 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s7 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc -; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: 
v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] -; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX7LESS-DPP-NEXT: v_min_f64 v[2:3], v[0:1], 4.0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v4 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v5 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX7LESS-DPP-NEXT: .LBB10_3: @@ -6035,12 +6035,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX7LESS-NEXT: .LBB11_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX7LESS-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 ; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 ; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 @@ -6048,7 +6048,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] ; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 ; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] -; GFX7LESS-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] +; GFX7LESS-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX7LESS-NEXT: s_cbranch_vccnz .LBB11_1 ; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd ; GFX7LESS-NEXT: 
v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -6062,25 +6062,25 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 +; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX7LESS-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX7LESS-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX7LESS-NEXT: v_min_f64 v[6:7], v[0:1], v[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v7 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v8 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v9 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_4 ; GFX7LESS-NEXT: .LBB11_5: @@ -6449,23 +6449,23 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: 
buffer_load_dwordx2 v[2:3], off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX7LESS-DPP-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[6:7], v[6:7] +; GFX7LESS-DPP-NEXT: v_min_f64 v[4:5], v[2:3], v[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v4 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v5 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[2:5], off, s[36:39], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[6:7] ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB11_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6950,20 +6950,20 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB12_2: ; 
%atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; GFX7LESS-NEXT: v_min_f32_e32 v1, 4.0, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB12_2 ; GFX7LESS-NEXT: .LBB12_3: @@ -7075,20 +7075,20 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s2 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-DPP-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; GFX7LESS-DPP-NEXT: v_min_f32_e32 v1, 4.0, v0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; 
GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB12_2 ; GFX7LESS-DPP-NEXT: .LBB12_3: @@ -7204,20 +7204,20 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; GFX7LESS-NEXT: v_min_f32_e32 v1, 4.0, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB13_2 ; GFX7LESS-NEXT: .LBB13_3: @@ -7329,20 +7329,20 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s2 ; 
GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-DPP-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; GFX7LESS-DPP-NEXT: v_min_f32_e32 v1, 4.0, v0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB13_2 ; GFX7LESS-DPP-NEXT: .LBB13_3: diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll index 6311143f57260..39c40195d1ac4 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll @@ -34,19 +34,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX7LESS-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 ; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0 +; GFX7LESS-NEXT: v_sub_f32_e32 v3, v4, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: 
v_mov_b32_e32 v4, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB0_2 ; GFX7LESS-NEXT: .LBB0_3: @@ -235,19 +235,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX7LESS-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 ; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s6 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-DPP-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v3, v4, v2 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB0_2 ; GFX7LESS-DPP-NEXT: .LBB0_3: @@ -476,20 +476,20 @@ define amdgpu_kernel void 
@global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: .LBB1_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0 +; GFX7LESS-NEXT: v_sub_f32_e32 v3, v4, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_4 ; GFX7LESS-NEXT: .LBB1_5: @@ -848,20 +848,20 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX7LESS-DPP-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-DPP-NEXT: 
buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v2, v3, v0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[1:2], off, s[36:39], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB1_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1275,19 +1275,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX7LESS-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0 +; GFX7LESS-NEXT: v_sub_f32_e32 v3, v4, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB2_2 ; GFX7LESS-NEXT: .LBB2_3: @@ -1525,19 +1525,19 @@ define amdgpu_kernel void 
@global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX7LESS-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s2 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v3, v4, v2 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB2_2 ; GFX7LESS-DPP-NEXT: .LBB2_3: @@ -1806,20 +1806,20 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: .LBB3_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0 +; GFX7LESS-NEXT: v_sub_f32_e32 v3, v4, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-NEXT: 
v_mov_b32_e32 v3, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_4 ; GFX7LESS-NEXT: .LBB3_5: @@ -2178,20 +2178,20 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX7LESS-DPP-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v2, v3, v0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[1:2], off, s[36:39], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB3_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ 
-2605,19 +2605,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX7LESS-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0 +; GFX7LESS-NEXT: v_sub_f32_e32 v3, v4, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB4_2 ; GFX7LESS-NEXT: .LBB4_3: @@ -2855,19 +2855,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX7LESS-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s2 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v3, v4, v2 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-DPP-NEXT: 
buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB4_2 ; GFX7LESS-DPP-NEXT: .LBB4_3: @@ -3136,20 +3136,20 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: .LBB5_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0 +; GFX7LESS-NEXT: v_sub_f32_e32 v3, v4, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_4 ; GFX7LESS-NEXT: .LBB5_5: @@ -3508,20 +3508,20 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], 
s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX7LESS-DPP-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v2, v3, v0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[1:2], off, s[36:39], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB5_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3962,20 +3962,20 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: .LBB6_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0 +; GFX7LESS-NEXT: v_sub_f32_e32 v3, v4, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 -; 
GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_4 ; GFX7LESS-NEXT: .LBB6_5: @@ -4334,20 +4334,20 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX7LESS-DPP-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v2, v3, v0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[1:2], off, s[36:39], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB6_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; 
%atomicrmw.end @@ -4761,19 +4761,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX7LESS-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0 +; GFX7LESS-NEXT: v_sub_f32_e32 v3, v4, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_2 ; GFX7LESS-NEXT: .LBB7_3: @@ -5011,19 +5011,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX7LESS-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s2 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v3, v4, v2 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-DPP-NEXT: 
buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v4 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB7_2 ; GFX7LESS-DPP-NEXT: .LBB7_3: @@ -5291,20 +5291,20 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: .LBB8_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0 +; GFX7LESS-NEXT: v_sub_f32_e32 v3, v4, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB8_4 ; GFX7LESS-NEXT: .LBB8_5: @@ -5663,20 +5663,20 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], 
s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX7LESS-DPP-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v2, v3, v0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[1:2], off, s[36:39], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB8_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6080,23 +6080,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX7LESS-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s6 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s7 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s7 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v0 +; GFX7LESS-NEXT: v_add_f64 v[6:7], v[8:9], -v[4:5] ; 
GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v7 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v8 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v9 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_2 ; GFX7LESS-NEXT: .LBB9_3: @@ -6294,23 +6294,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX7LESS-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s6 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s7 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s7 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v0 +; GFX7LESS-DPP-NEXT: v_add_f64 v[6:7], v[8:9], -v[4:5] ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v6 +; GFX7LESS-DPP-NEXT: 
v_mov_b32_e32 v1, v7 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v9 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX7LESS-DPP-NEXT: .LBB9_3: @@ -6550,23 +6550,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 +; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v0 +; GFX7LESS-NEXT: v_add_f64 v[6:7], v[8:9], -v[4:5] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v7 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v8 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v9 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: 
v_mov_b32_e32 v2, v6 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_4 ; GFX7LESS-NEXT: .LBB10_5: @@ -6940,23 +6940,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX7LESS-DPP-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc -; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] -; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f64 v[4:5], v[6:7], -v[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v4 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v5 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[2:5], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[6:7] +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB10_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7436,23 +7436,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; 
GFX7LESS-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] ; GFX7LESS-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s8 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s9 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s9 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v0 +; GFX7LESS-NEXT: v_add_f64 v[6:7], v[8:9], -v[4:5] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v7 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v8 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v9 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_2 ; GFX7LESS-NEXT: .LBB11_3: @@ -7693,23 +7693,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] ; GFX7LESS-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s8 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s9 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; 
GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v0 +; GFX7LESS-DPP-NEXT: v_add_f64 v[6:7], v[8:9], -v[4:5] ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v7 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v9 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB11_2 ; GFX7LESS-DPP-NEXT: .LBB11_3: @@ -7982,23 +7982,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 +; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: .LBB12_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v0 +; GFX7LESS-NEXT: v_add_f64 v[6:7], v[8:9], -v[4:5] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-NEXT: 
v_mov_b32_e32 v8, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v7 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v8 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v9 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB12_4 ; GFX7LESS-NEXT: .LBB12_5: @@ -8372,23 +8372,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX7LESS-DPP-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc -; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] -; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f64 v[4:5], v[6:7], -v[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v4 +; 
GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v5 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[2:5], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[6:7] +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB12_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8868,23 +8868,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] ; GFX7LESS-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s8 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s9 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s9 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v0 +; GFX7LESS-NEXT: v_add_f64 v[6:7], v[8:9], -v[4:5] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v7 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v8 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v9 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 ; GFX7LESS-NEXT: s_andn2_b64 
exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB13_2 ; GFX7LESS-NEXT: .LBB13_3: @@ -9125,23 +9125,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] ; GFX7LESS-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s8 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s9 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-DPP-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v0 +; GFX7LESS-DPP-NEXT: v_add_f64 v[6:7], v[8:9], -v[4:5] ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v7 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v9 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB13_2 ; GFX7LESS-DPP-NEXT: .LBB13_3: @@ -9415,23 +9415,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; 
GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 +; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: .LBB14_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v0 +; GFX7LESS-NEXT: v_add_f64 v[6:7], v[8:9], -v[4:5] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v7 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v8 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v9 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB14_4 ; GFX7LESS-NEXT: .LBB14_5: @@ -9805,23 +9805,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX7LESS-DPP-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], 
v[4:5], -v[0:1] -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc -; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] -; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f64 v[4:5], v[6:7], -v[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v4 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v5 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[2:5], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[6:7] +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB14_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10330,23 +10330,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 +; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: .LBB15_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v0 +; GFX7LESS-NEXT: v_add_f64 v[6:7], v[8:9], -v[4:5] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 -; GFX7LESS-NEXT: 
buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v7 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v8 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v9 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB15_4 ; GFX7LESS-NEXT: .LBB15_5: @@ -10720,23 +10720,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX7LESS-DPP-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc -; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] -; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f64 v[4:5], v[6:7], -v[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v4 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v5 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 
v7 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[2:5], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[6:7] +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB15_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11215,23 +11215,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] ; GFX7LESS-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s8 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s9 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s9 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v0 +; GFX7LESS-NEXT: v_add_f64 v[6:7], v[8:9], -v[4:5] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v7 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v8 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v9 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB16_2 ; GFX7LESS-NEXT: .LBB16_3: @@ -11472,23 +11472,23 @@ define 
amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] ; GFX7LESS-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s8 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s9 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-DPP-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v0 +; GFX7LESS-DPP-NEXT: v_add_f64 v[6:7], v[8:9], -v[4:5] ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v7 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v9 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX7LESS-DPP-NEXT: .LBB16_3: @@ -11762,23 +11762,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 +; GFX7LESS-NEXT: 
buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v0 +; GFX7LESS-NEXT: v_add_f64 v[6:7], v[8:9], -v[4:5] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v7 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v8 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v9 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB17_4 ; GFX7LESS-NEXT: .LBB17_5: @@ -12152,23 +12152,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 ; GFX7LESS-DPP-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5 -; 
GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc -; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] -; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f64 v[4:5], v[6:7], -v[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v4 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v5 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7 +; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[2:5], off, s[36:39], 0 glc +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[6:7] +; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll index 17a5f520ff41e..2cf76554078a7 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -5863,12 +5863,6 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; NOOPT-NEXT: v_mov_b32_e32 v13, s13 ; NOOPT-NEXT: v_mov_b32_e32 v14, s14 ; NOOPT-NEXT: v_mov_b32_e32 v15, s15 -; NOOPT-NEXT: s_mov_b64 s[0:1], exec -; NOOPT-NEXT: v_writelane_b32 v32, s0, 5 -; NOOPT-NEXT: v_writelane_b32 v32, s1, 6 -; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 -; NOOPT-NEXT: buffer_store_dword v32, off, s[28:31], 0 ; 4-byte Folded Spill -; NOOPT-NEXT: s_mov_b64 exec, s[26:27] ; NOOPT-NEXT: buffer_store_dword v0, off, s[28:31], 0 offset:4 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v1, off, s[28:31], 0 offset:8 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v2, off, s[28:31], 0 offset:12 
; 4-byte Folded Spill @@ -5885,6 +5879,12 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; NOOPT-NEXT: buffer_store_dword v13, off, s[28:31], 0 offset:56 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v14, off, s[28:31], 0 offset:60 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v15, off, s[28:31], 0 offset:64 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 s[0:1], exec +; NOOPT-NEXT: v_writelane_b32 v32, s0, 5 +; NOOPT-NEXT: v_writelane_b32 v32, s1, 6 +; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 +; NOOPT-NEXT: buffer_store_dword v32, off, s[28:31], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[26:27] ; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 ; NOOPT-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 ; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 @@ -5903,19 +5903,12 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; NOOPT-NEXT: buffer_load_dword v6, off, s[28:31], 0 offset:28 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v7, off, s[28:31], 0 offset:32 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v8, off, s[28:31], 0 offset:36 ; 4-byte Folded Reload -; NOOPT-NEXT: s_waitcnt expcnt(6) ; NOOPT-NEXT: buffer_load_dword v9, off, s[28:31], 0 offset:40 ; 4-byte Folded Reload -; NOOPT-NEXT: s_waitcnt expcnt(5) ; NOOPT-NEXT: buffer_load_dword v10, off, s[28:31], 0 offset:44 ; 4-byte Folded Reload -; NOOPT-NEXT: s_waitcnt expcnt(4) ; NOOPT-NEXT: buffer_load_dword v11, off, s[28:31], 0 offset:48 ; 4-byte Folded Reload -; NOOPT-NEXT: s_waitcnt expcnt(3) ; NOOPT-NEXT: buffer_load_dword v12, off, s[28:31], 0 offset:52 ; 4-byte Folded Reload -; NOOPT-NEXT: s_waitcnt expcnt(2) ; NOOPT-NEXT: buffer_load_dword v13, off, s[28:31], 0 offset:56 ; 4-byte Folded Reload -; NOOPT-NEXT: s_waitcnt expcnt(1) ; NOOPT-NEXT: buffer_load_dword v14, off, s[28:31], 0 offset:60 ; 4-byte Folded Reload -; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_load_dword v15, off, 
s[28:31], 0 offset:64 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v16, off, s[28:31], 0 offset:72 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v17, off, s[28:31], 0 offset:80 ; 4-byte Folded Reload @@ -9004,27 +8997,26 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) { ; GENERIC-LABEL: broken_phi_bb: ; GENERIC: ; %bb.0: ; %bb ; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GENERIC-NEXT: s_mov_b32 s6, 8 +; GENERIC-NEXT: s_mov_b32 s4, 8 ; GENERIC-NEXT: s_mov_b32 s3, 0xf000 ; GENERIC-NEXT: s_mov_b32 s2, -1 ; GENERIC-NEXT: s_branch .LBB26_2 ; GENERIC-NEXT: .LBB26_1: ; %Flow ; GENERIC-NEXT: ; in Loop: Header=BB26_2 Depth=1 ; GENERIC-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GENERIC-NEXT: s_mov_b32 s4, s1 ; GENERIC-NEXT: s_cbranch_vccz .LBB26_4 ; GENERIC-NEXT: .LBB26_2: ; %bb2 ; GENERIC-NEXT: ; =>This Inner Loop Header: Depth=1 ; GENERIC-NEXT: s_waitcnt lgkmcnt(0) -; GENERIC-NEXT: s_cmp_ge_i32 s6, s0 +; GENERIC-NEXT: s_cmp_ge_i32 s4, s0 ; GENERIC-NEXT: s_mov_b64 s[4:5], -1 -; GENERIC-NEXT: ; implicit-def: $sgpr6 ; GENERIC-NEXT: s_cbranch_scc1 .LBB26_1 ; GENERIC-NEXT: ; %bb.3: ; %bb4 ; GENERIC-NEXT: ; in Loop: Header=BB26_2 Depth=1 ; GENERIC-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc ; GENERIC-NEXT: s_waitcnt vmcnt(0) ; GENERIC-NEXT: s_mov_b64 s[4:5], 0 -; GENERIC-NEXT: s_mov_b32 s6, s1 ; GENERIC-NEXT: s_branch .LBB26_1 ; GENERIC-NEXT: .LBB26_4: ; %bb8 ; GENERIC-NEXT: s_endpgm @@ -9065,8 +9057,8 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) { ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: v_cmp_ge_i32_e64 s[2:3], v0, s2 ; NOOPT-NEXT: v_mov_b32_e32 v0, s4 -; NOOPT-NEXT: s_and_b64 vcc, exec, s[2:3] ; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:8 ; 4-byte Folded Spill +; NOOPT-NEXT: s_and_b64 vcc, exec, s[2:3] ; NOOPT-NEXT: v_writelane_b32 v18, s0, 2 ; NOOPT-NEXT: v_writelane_b32 v18, s1, 3 ; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1 @@ -9112,30 +9104,30 @@ define amdgpu_kernel void 
@broken_phi_bb(i32 %arg, i32 %arg1) { ; NOOPT-NEXT: v_mov_b32_e32 v13, s17 ; NOOPT-NEXT: v_mov_b32_e32 v14, s18 ; NOOPT-NEXT: v_mov_b32_e32 v15, s19 -; NOOPT-NEXT: v_mov_b32_e32 v16, s0 -; NOOPT-NEXT: buffer_store_dword v16, off, s[24:27], 0 offset:76 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:16 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v1, off, s[24:27], 0 offset:20 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v2, off, s[24:27], 0 offset:24 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[24:27], 0 offset:28 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[24:27], 0 offset:32 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[24:27], 0 offset:36 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[24:27], 0 offset:40 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[24:27], 0 offset:44 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[24:27], 0 offset:48 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[24:27], 0 offset:52 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[24:27], 0 offset:56 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[24:27], 0 offset:60 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[24:27], 0 offset:64 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[24:27], 0 offset:68 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[24:27], 0 offset:72 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[24:27], 0 offset:76 ; 4-byte Folded Spill +; NOOPT-NEXT: v_mov_b32_e32 v0, s0 +; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:12 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 s[0:1], exec ; NOOPT-NEXT: v_writelane_b32 v18, s0, 4 ; NOOPT-NEXT: v_writelane_b32 v18, s1, 5 ; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1 ; NOOPT-NEXT: buffer_store_dword v18, 
off, s[24:27], 0 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 exec, s[20:21] -; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:12 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v1, off, s[24:27], 0 offset:16 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v2, off, s[24:27], 0 offset:20 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v3, off, s[24:27], 0 offset:24 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v4, off, s[24:27], 0 offset:28 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v5, off, s[24:27], 0 offset:32 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v6, off, s[24:27], 0 offset:36 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v7, off, s[24:27], 0 offset:40 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v8, off, s[24:27], 0 offset:44 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v9, off, s[24:27], 0 offset:48 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v10, off, s[24:27], 0 offset:52 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v11, off, s[24:27], 0 offset:56 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v12, off, s[24:27], 0 offset:60 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v13, off, s[24:27], 0 offset:64 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v14, off, s[24:27], 0 offset:68 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v15, off, s[24:27], 0 offset:72 ; 4-byte Folded Spill ; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 ; NOOPT-NEXT: .LBB26_3: ; Parent Loop BB26_1 Depth=1 ; NOOPT-NEXT: ; => This Inner Loop Header: Depth=2 @@ -9146,30 +9138,23 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) { ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: v_readlane_b32 s0, v18, 6 ; NOOPT-NEXT: v_readlane_b32 s1, v18, 7 -; NOOPT-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:12 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v1, off, s[24:27], 0 offset:16 ; 4-byte Folded Reload 
-; NOOPT-NEXT: buffer_load_dword v2, off, s[24:27], 0 offset:20 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v3, off, s[24:27], 0 offset:24 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v4, off, s[24:27], 0 offset:28 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v5, off, s[24:27], 0 offset:32 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v6, off, s[24:27], 0 offset:36 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v7, off, s[24:27], 0 offset:40 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v8, off, s[24:27], 0 offset:44 ; 4-byte Folded Reload -; NOOPT-NEXT: s_waitcnt expcnt(6) -; NOOPT-NEXT: buffer_load_dword v9, off, s[24:27], 0 offset:48 ; 4-byte Folded Reload -; NOOPT-NEXT: s_waitcnt expcnt(5) -; NOOPT-NEXT: buffer_load_dword v10, off, s[24:27], 0 offset:52 ; 4-byte Folded Reload -; NOOPT-NEXT: s_waitcnt expcnt(4) -; NOOPT-NEXT: buffer_load_dword v11, off, s[24:27], 0 offset:56 ; 4-byte Folded Reload -; NOOPT-NEXT: s_waitcnt expcnt(3) -; NOOPT-NEXT: buffer_load_dword v12, off, s[24:27], 0 offset:60 ; 4-byte Folded Reload -; NOOPT-NEXT: s_waitcnt expcnt(2) -; NOOPT-NEXT: buffer_load_dword v13, off, s[24:27], 0 offset:64 ; 4-byte Folded Reload -; NOOPT-NEXT: s_waitcnt expcnt(1) -; NOOPT-NEXT: buffer_load_dword v14, off, s[24:27], 0 offset:68 ; 4-byte Folded Reload -; NOOPT-NEXT: s_waitcnt expcnt(0) -; NOOPT-NEXT: buffer_load_dword v15, off, s[24:27], 0 offset:72 ; 4-byte Folded Reload -; NOOPT-NEXT: buffer_load_dword v16, off, s[24:27], 0 offset:76 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:16 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v1, off, s[24:27], 0 offset:20 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v2, off, s[24:27], 0 offset:24 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v3, off, s[24:27], 0 offset:28 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v4, off, s[24:27], 0 offset:32 ; 4-byte Folded Reload +; 
NOOPT-NEXT: buffer_load_dword v5, off, s[24:27], 0 offset:36 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v6, off, s[24:27], 0 offset:40 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v7, off, s[24:27], 0 offset:44 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v8, off, s[24:27], 0 offset:48 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v9, off, s[24:27], 0 offset:52 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v10, off, s[24:27], 0 offset:56 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v11, off, s[24:27], 0 offset:60 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v12, off, s[24:27], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v13, off, s[24:27], 0 offset:68 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v14, off, s[24:27], 0 offset:72 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v15, off, s[24:27], 0 offset:76 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v16, off, s[24:27], 0 offset:12 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v17, off, s[24:27], 0 offset:80 ; 4-byte Folded Reload ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: v_readfirstlane_b32 s2, v17 @@ -9193,22 +9178,22 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) { ; NOOPT-NEXT: buffer_store_dword v13, off, s[24:27], 0 offset:136 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v14, off, s[24:27], 0 offset:140 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v15, off, s[24:27], 0 offset:144 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:12 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v1, off, s[24:27], 0 offset:16 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v2, off, s[24:27], 0 offset:20 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v3, off, s[24:27], 0 offset:24 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v4, off, s[24:27], 0 offset:28 ; 4-byte Folded Spill -; 
NOOPT-NEXT: buffer_store_dword v5, off, s[24:27], 0 offset:32 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v6, off, s[24:27], 0 offset:36 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v7, off, s[24:27], 0 offset:40 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v8, off, s[24:27], 0 offset:44 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v9, off, s[24:27], 0 offset:48 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v10, off, s[24:27], 0 offset:52 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v11, off, s[24:27], 0 offset:56 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v12, off, s[24:27], 0 offset:60 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v13, off, s[24:27], 0 offset:64 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v14, off, s[24:27], 0 offset:68 ; 4-byte Folded Spill -; NOOPT-NEXT: buffer_store_dword v15, off, s[24:27], 0 offset:72 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:16 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v1, off, s[24:27], 0 offset:20 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v2, off, s[24:27], 0 offset:24 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[24:27], 0 offset:28 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[24:27], 0 offset:32 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[24:27], 0 offset:36 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[24:27], 0 offset:40 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[24:27], 0 offset:44 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[24:27], 0 offset:48 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[24:27], 0 offset:52 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[24:27], 0 offset:56 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[24:27], 0 offset:60 ; 4-byte Folded 
Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[24:27], 0 offset:64 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[24:27], 0 offset:68 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[24:27], 0 offset:72 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[24:27], 0 offset:76 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 s[2:3], s[0:1] ; NOOPT-NEXT: v_writelane_b32 v18, s2, 6 ; NOOPT-NEXT: v_writelane_b32 v18, s3, 7 @@ -9246,9 +9231,9 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) { ; NOOPT-NEXT: buffer_load_dword v13, off, s[24:27], 0 offset:136 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v14, off, s[24:27], 0 offset:140 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v15, off, s[24:27], 0 offset:144 ; 4-byte Folded Reload -; NOOPT-NEXT: s_mov_b64 s[0:1], 0 ; NOOPT-NEXT: s_waitcnt vmcnt(14) ; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:8 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 s[0:1], 0 ; NOOPT-NEXT: v_writelane_b32 v18, s0, 2 ; NOOPT-NEXT: v_writelane_b32 v18, s1, 3 ; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1 diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll index 60f77bda6d50e..81771ffb7892f 100644 --- a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll @@ -28,29 +28,29 @@ define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, ptr addrspace(1 ; GCN-NEXT: ; %bb.1: ; %atomic ; GCN-NEXT: s_mov_b32 s8, s10 ; GCN-NEXT: s_mov_b32 s9, s10 -; GCN-NEXT: buffer_load_dword v4, v[1:2], s[8:11], 0 addr64 offset:400 +; GCN-NEXT: buffer_load_dword v3, v[1:2], s[8:11], 0 addr64 offset:400 ; GCN-NEXT: s_load_dword s4, s[4:5], 0xf ; GCN-NEXT: s_mov_b64 s[2:3], 0 ; GCN-NEXT: .LBB0_2: ; %atomicrmw.start ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: 
v_max_i32_e32 v3, s4, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v6, v4 +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v5, v3 -; GCN-NEXT: buffer_atomic_cmpswap v[5:6], v[1:2], s[8:11], 0 addr64 offset:400 glc +; GCN-NEXT: s_waitcnt expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_max_i32_e32 v4, s4, v5 +; GCN-NEXT: v_mov_b32_e32 v3, v4 +; GCN-NEXT: v_mov_b32_e32 v4, v5 +; GCN-NEXT: buffer_atomic_cmpswap v[3:4], v[1:2], s[8:11], 0 addr64 offset:400 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_wbinvl1 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GCN-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN-NEXT: v_mov_b32_e32 v4, v5 ; GCN-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN-NEXT: s_cbranch_execnz .LBB0_2 ; GCN-NEXT: ; %bb.3: ; %atomicrmw.end ; GCN-NEXT: s_or_b64 exec, exec, s[2:3] ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], 0 ; GCN-NEXT: .LBB0_4: ; %exit ; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -87,22 +87,22 @@ define amdgpu_kernel void @atomic_max_i32_noret(ptr addrspace(1) %out, ptr addrs ; GCN-NEXT: ; %bb.1: ; %atomic ; GCN-NEXT: s_mov_b32 s0, s2 ; GCN-NEXT: s_mov_b32 s1, s2 -; GCN-NEXT: buffer_load_dword v4, v[1:2], s[0:3], 0 addr64 offset:400 +; GCN-NEXT: buffer_load_dword v3, v[1:2], s[0:3], 0 addr64 offset:400 ; GCN-NEXT: s_load_dword s6, s[4:5], 0xf ; GCN-NEXT: s_mov_b64 s[4:5], 0 ; GCN-NEXT: .LBB1_2: ; %atomicrmw.start ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: v_max_i32_e32 v3, s6, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v6, v4 +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v5, v3 -; GCN-NEXT: buffer_atomic_cmpswap v[5:6], v[1:2], s[0:3], 0 addr64 offset:400 glc +; GCN-NEXT: s_waitcnt expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_max_i32_e32 v4, s6, v5 +; GCN-NEXT: 
v_mov_b32_e32 v3, v4 +; GCN-NEXT: v_mov_b32_e32 v4, v5 +; GCN-NEXT: buffer_atomic_cmpswap v[3:4], v[1:2], s[0:3], 0 addr64 offset:400 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_wbinvl1 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v4, v5 ; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN-NEXT: s_cbranch_execnz .LBB1_2 ; GCN-NEXT: .LBB1_3: ; %exit diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll index 7e3d5c97391e1..cf32fd5934009 100644 --- a/llvm/test/CodeGen/AMDGPU/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.ll @@ -2888,32 +2888,32 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; EG-NEXT: Fetch clause starting at 12: ; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 ; EG-NEXT: ALU clause starting at 14: -; EG-NEXT: OR_INT T0.W, KC0[2].W, KC0[3].X, -; EG-NEXT: MOV * T1.W, literal.x, +; EG-NEXT: OR_INT T1.W, KC0[2].W, KC0[3].X, +; EG-NEXT: MOV * T0.W, literal.x, ; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00) -; EG-NEXT: SETNE_INT * T0.W, PV.W, 0.0, +; EG-NEXT: SETNE_INT * T1.W, PV.W, 0.0, ; EG-NEXT: PRED_SETNE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0, ; EG-NEXT: ALU clause starting at 19: -; EG-NEXT: MOV T0.W, KC0[2].W, -; EG-NEXT: MOV * T1.W, KC0[3].Z, +; EG-NEXT: MOV T1.W, KC0[2].W, +; EG-NEXT: MOV * T0.W, KC0[3].Z, ; EG-NEXT: MOV T2.W, KC0[3].Y, ; EG-NEXT: MULLO_INT * T0.X, PV.W, PS, -; EG-NEXT: MOV T1.W, KC0[3].X, -; EG-NEXT: MULHI * T0.Y, T0.W, PV.W, +; EG-NEXT: MOV T0.W, KC0[3].X, +; EG-NEXT: MULHI * T0.Y, T1.W, PV.W, ; EG-NEXT: ADD_INT T3.W, PS, T0.X, ; EG-NEXT: MULLO_INT * T0.X, PV.W, T2.W, ; EG-NEXT: ADD_INT T0.Y, PV.W, PS, -; EG-NEXT: MOV T1.W, literal.x, -; EG-NEXT: MULLO_INT * T0.X, T0.W, T2.W, +; EG-NEXT: MOV T0.W, literal.x, +; EG-NEXT: MULLO_INT * T0.X, T1.W, T2.W, ; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00) ; EG-NEXT: ALU clause starting at 31: -; EG-NEXT: MOV T0.W, KC0[2].Y, 
-; EG-NEXT: SETE_INT * T1.W, T1.W, 0.0, +; EG-NEXT: MOV T1.W, KC0[2].Y, +; EG-NEXT: SETE_INT * T0.W, T0.W, 0.0, ; EG-NEXT: PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0, ; EG-NEXT: ALU clause starting at 34: ; EG-NEXT: MOV * T0.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 35: -; EG-NEXT: LSHR * T1.X, T0.W, literal.x, +; EG-NEXT: LSHR * T1.X, T1.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) entry: %0 = icmp eq i64 %a, 0 diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll index 8fe68ba748971..77e64ab53716a 100644 --- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll @@ -240,27 +240,31 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0: ; %bb.0: ; %_udiv-special-cases ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def 
$vgpr17_vgpr18 killed $exec -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v1 +; GFX9-O0-NEXT: s_waitcnt vmcnt(2) +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v2 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec @@ -270,34 +274,34 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX9-O0-NEXT: s_mov_b32 s4, 63 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v12 -; GFX9-O0-NEXT: v_ashrrev_i64 v[2:3], s4, v[2:3] -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12 +; GFX9-O0-NEXT: v_ashrrev_i64 v[3:4], s4, v[3:4] +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:104 ; 
4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v12 +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v12 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 -; GFX9-O0-NEXT: ; implicit-def: $vgpr30 : SGPR spill to VGPR lane -; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 0 -; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 1 +; GFX9-O0-NEXT: ; implicit-def: $vgpr29 : SGPR spill to VGPR lane +; GFX9-O0-NEXT: v_writelane_b32 v29, s6, 0 +; GFX9-O0-NEXT: v_writelane_b32 v29, s7, 1 ; GFX9-O0-NEXT: s_mov_b32 s10, s6 -; GFX9-O0-NEXT: v_writelane_b32 v30, s10, 2 +; GFX9-O0-NEXT: v_writelane_b32 v29, s10, 2 ; GFX9-O0-NEXT: s_mov_b32 s11, s7 -; GFX9-O0-NEXT: v_writelane_b32 v30, s11, 3 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v9, vcc, s10, v2 +; GFX9-O0-NEXT: v_writelane_b32 v29, s11, 3 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v9, vcc, s10, v0 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v5, vcc, v4, v3, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v5, vcc, v4, v1, vcc ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s10 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v0, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v2, vcc ; GFX9-O0-NEXT: v_mov_b32_e32 v6, s11 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v1, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v3, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec @@ -305,25 +309,25 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 ; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX9-O0-NEXT: v_cmp_lt_i64_e64 s[4:5], v[11:12], s[4:5] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[4:5] +; 
GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[4:5] ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[4:5] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[4:5] ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[4:5] ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v8 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v17 @@ -462,18 +466,18 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v6 ; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[4:5], s[8:9] -; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v0 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v2 ; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s13 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v1 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v3 ; GFX9-O0-NEXT: v_min_u32_e64 v5, v4, v5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr16 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s12 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 -; GFX9-O0-NEXT: 
v_ffbh_u32_e64 v4, v2 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v0 ; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s13 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v3 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v1 ; GFX9-O0-NEXT: v_min_u32_e64 v11, v4, v10 ; GFX9-O0-NEXT: ; implicit-def: $sgpr13 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s12 @@ -553,35 +557,35 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[12:13] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[12:13] ; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[12:13] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[12:13] ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 -; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 ; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[12:13] -; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[12:13] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9] ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 -; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte 
Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 s[4:5], exec -; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 4 -; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 5 +; GFX9-O0-NEXT: v_writelane_b32 v29, s4, 4 +; GFX9-O0-NEXT: v_writelane_b32 v29, s5, 5 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] ; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] @@ -589,67 +593,64 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_branch .LBB0_8 ; GFX9-O0-NEXT: .LBB0_1: ; %Flow ; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 6 -; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 7 +; GFX9-O0-NEXT: v_readlane_b32 s4, v29, 6 +; GFX9-O0-NEXT: v_readlane_b32 s5, v29, 7 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: ; %bb.2: ; %Flow -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, 
s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, 
s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_5 ; GFX9-O0-NEXT: .LBB0_3: ; %Flow2 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 4 -; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 5 +; GFX9-O0-NEXT: v_readlane_b32 s4, v29, 4 +; GFX9-O0-NEXT: v_readlane_b32 s5, v29, 5 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte 
Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_9 ; GFX9-O0-NEXT: .LBB0_4: ; %udiv-loop-exit -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-O0-NEXT: 
buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 1 ; GFX9-O0-NEXT: s_waitcnt vmcnt(2) ; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[0:1] @@ -674,408 +675,408 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_3 ; GFX9-O0-NEXT: .LBB0_5: ; %Flow1 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: 
v_readlane_b32 s4, v30, 8 -; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 9 +; GFX9-O0-NEXT: v_readlane_b32 s4, v29, 8 +; GFX9-O0-NEXT: v_readlane_b32 s5, v29, 9 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-O0-NEXT: 
buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_4 ; GFX9-O0-NEXT: .LBB0_6: ; %udiv-do-while ; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s6, v30, 10 -; GFX9-O0-NEXT: v_readlane_b32 s7, v30, 11 -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; 
GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_readlane_b32 s6, v29, 10 +; GFX9-O0-NEXT: v_readlane_b32 s7, v29, 11 +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; 
GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 63 -; GFX9-O0-NEXT: s_waitcnt vmcnt(16) -; GFX9-O0-NEXT: v_lshrrev_b64 v[28:29], s4, v[2:3] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v29 +; GFX9-O0-NEXT: s_waitcnt 
vmcnt(10) +; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], s4, v[2:3] +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v7 ; GFX9-O0-NEXT: s_mov_b32 s5, 1 -; GFX9-O0-NEXT: v_lshlrev_b64 v[22:23], s5, v[22:23] -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v23 -; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v28 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v22 -; GFX9-O0-NEXT: v_or_b32_e64 v22, v5, v10 -; GFX9-O0-NEXT: ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v23, v4 -; GFX9-O0-NEXT: v_lshlrev_b64 v[28:29], s5, v[2:3] -; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], s4, v[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v29 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-O0-NEXT: s_waitcnt vmcnt(8) +; GFX9-O0-NEXT: v_lshlrev_b64 v[26:27], s5, v[26:27] +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v27 +; GFX9-O0-NEXT: v_or_b32_e64 v14, v14, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v26 +; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 +; GFX9-O0-NEXT: v_lshlrev_b64 v[26:27], s5, v[2:3] +; GFX9-O0-NEXT: v_lshrrev_b64 v[14:15], s4, v[10:11] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v27 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v15 ; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v28 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_or_b32_e64 v4, v3, v4 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v26 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_or_b32_e64 v14, v3, v14 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v2 ; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s5, v[0:1] -; GFX9-O0-NEXT: v_lshlrev_b64 v[28:29], s5, v[6:7] 
+; GFX9-O0-NEXT: v_lshlrev_b64 v[26:27], s5, v[10:11] ; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v29 -; GFX9-O0-NEXT: s_waitcnt vmcnt(10) +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v27 -; GFX9-O0-NEXT: v_or3_b32 v6, v6, v7, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v28, v25 +; GFX9-O0-NEXT: v_or3_b32 v10, v10, v11, v28 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v28 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v26 -; GFX9-O0-NEXT: v_or3_b32 v0, v0, v1, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v26 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v24 +; GFX9-O0-NEXT: v_or3_b32 v0, v0, v1, v11 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-O0-NEXT: s_waitcnt vmcnt(8) -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v25 -; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v17 +; GFX9-O0-NEXT: v_or_b32_e64 v10, v10, v11 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v24 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v16 ; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v22 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7 ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v14 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v15 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v13, vcc, v13, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 
v7, v4 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_subb_co_u32_e32 v12, vcc, v12, v10, vcc -; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v11, v4, vcc -; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v5, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v22 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v7, vcc, v7, v10 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v16, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v14, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v6, vcc, v5, v11, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 -; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v7 -; GFX9-O0-NEXT: v_ashrrev_i64 v[13:14], s4, v[11:12] -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 +; GFX9-O0-NEXT: v_ashrrev_i64 v[5:6], s4, v[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v6 ; GFX9-O0-NEXT: s_mov_b64 s[4:5], 1 ; GFX9-O0-NEXT: s_mov_b32 s8, s5 -; GFX9-O0-NEXT: v_and_b32_e64 v12, v7, s8 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v13 +; GFX9-O0-NEXT: v_and_b32_e64 v4, v15, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v5 ; GFX9-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 -; GFX9-O0-NEXT: v_and_b32_e64 v14, v11, s4 -; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, 0 +; GFX9-O0-NEXT: v_and_b32_e64 v6, v17, s4 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-O0-NEXT: v_mov_b32_e32 v22, v21 -; GFX9-O0-NEXT: v_and_b32_e64 v22, v7, v22 -; GFX9-O0-NEXT: v_and_b32_e64 v20, v11, v20 +; GFX9-O0-NEXT: v_and_b32_e64 v22, 
v15, v22 +; GFX9-O0-NEXT: v_and_b32_e64 v20, v17, v20 ; GFX9-O0-NEXT: ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v21, v22 ; GFX9-O0-NEXT: v_mov_b32_e32 v22, v19 -; GFX9-O0-NEXT: v_and_b32_e64 v7, v7, v22 -; GFX9-O0-NEXT: v_and_b32_e64 v22, v11, v18 -; GFX9-O0-NEXT: ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v23, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v22 -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v23 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v20 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v21 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v19 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v10, vcc, v10, v18, vcc -; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v11, vcc -; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v5, v7, vcc +; GFX9-O0-NEXT: v_and_b32_e64 v15, v15, v22 +; GFX9-O0-NEXT: v_and_b32_e64 v17, v17, v18 +; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v17 +; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 killed $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v20 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v21 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v10, vcc, v10, v19 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v16, vcc, v16, v18, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v14, vcc, v14, v17, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v11, v15, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v11 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10 +; GFX9-O0-NEXT: ; 
kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v16 +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v8 ; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 killed $vgpr8_vgpr9 killed $exec ; GFX9-O0-NEXT: s_mov_b64 s[8:9], -1 ; GFX9-O0-NEXT: s_mov_b32 s5, s8 ; GFX9-O0-NEXT: s_mov_b32 s4, s9 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v16 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, s5 -; GFX9-O0-NEXT: v_add_co_u32_e32 v19, vcc, v11, v16 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, s4 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v11, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v11, s5 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v16, vcc, v10, v11, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v10, s4 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v10, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s5 +; GFX9-O0-NEXT: v_add_co_u32_e32 v11, vcc, v11, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v12, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s5 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v12, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v10, vcc, v10, v12, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v9 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v9 ; 
GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v16 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v19 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v20 -; GFX9-O0-NEXT: v_mov_b32_e32 v21, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v20 -; GFX9-O0-NEXT: v_or_b32_e64 v18, v18, v21 -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v16 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v19 -; GFX9-O0-NEXT: v_or_b32_e64 v16, v16, v17 -; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v18 -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[16:17], v[12:13] +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v8 +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v12 +; GFX9-O0-NEXT: v_or_b32_e64 v10, v10, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v11 +; GFX9-O0-NEXT: v_or_b32_e64 v8, v8, v9 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v10 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[8:9], v[4:5] ; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v3 -; 
GFX9-O0-NEXT: v_mov_b32_e32 v16, v2 -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v0 -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v15 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v14 -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v12 -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 6 -; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 7 -; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 10 -; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 11 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; 
GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GFX9-O0-NEXT: v_writelane_b32 v29, s6, 6 +; GFX9-O0-NEXT: v_writelane_b32 v29, s7, 7 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GFX9-O0-NEXT: v_writelane_b32 v29, s6, 10 +; GFX9-O0-NEXT: v_writelane_b32 v29, s7, 11 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; 
GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: s_cbranch_execnz .LBB0_6 ; GFX9-O0-NEXT: s_branch .LBB0_1 ; GFX9-O0-NEXT: .LBB0_7: ; %udiv-preheader ; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], 
s32 offset:320 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:52 ; 
4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(9) -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], v4, v[18:19] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-O0-NEXT: v_lshrrev_b64 v[14:15], v8, v[18:19] +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v15 ; GFX9-O0-NEXT: s_mov_b32 s4, 64 -; GFX9-O0-NEXT: v_sub_u32_e64 v20, s4, v4 -; GFX9-O0-NEXT: v_lshlrev_b64 v[20:21], v20, v[14:15] +; GFX9-O0-NEXT: v_sub_u32_e64 v20, s4, v8 +; GFX9-O0-NEXT: v_lshlrev_b64 v[20:21], v20, v[16:17] ; GFX9-O0-NEXT: v_mov_b32_e32 v22, v21 -; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v22 -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v20 -; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v22, v7 -; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[6:7], v4, s4 -; GFX9-O0-NEXT: v_sub_u32_e64 v5, v4, s4 -; GFX9-O0-NEXT: v_lshrrev_b64 v[20:21], v5, v[14:15] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v21 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v22, s[6:7] +; GFX9-O0-NEXT: v_or_b32_e64 v11, v11, v22 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v20 +; GFX9-O0-NEXT: v_or_b32_e64 v14, v14, v15 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 
v15, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v15 +; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[6:7], v8, s4 +; GFX9-O0-NEXT: v_sub_u32_e64 v11, v8, s4 +; GFX9-O0-NEXT: v_lshrrev_b64 v[20:21], v11, v[16:17] +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v21 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v11, v11, v22, s[6:7] ; GFX9-O0-NEXT: s_mov_b32 s4, 0 -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, s4 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, s4 ; GFX9-O0-NEXT: v_mov_b32_e32 v22, v19 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v22, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v20 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v18 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v11, v11, v22, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v20 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v18 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[4:5] ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], v4, v[14:15] -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v11 +; GFX9-O0-NEXT: v_lshrrev_b64 v[16:17], v8, v[16:17] +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v17 ; GFX9-O0-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-O0-NEXT: s_mov_b32 s8, s5 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, s8 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, s8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v8, v11, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 killed $vgpr16_vgpr17 killed $exec ; GFX9-O0-NEXT: s_mov_b32 s8, s4 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 -; 
GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v16, v11, v16, s[6:7] ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v14 -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v13 +; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v8 +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v10 ; GFX9-O0-NEXT: s_mov_b64 s[8:9], -1 ; GFX9-O0-NEXT: s_mov_b32 s7, s8 ; GFX9-O0-NEXT: s_mov_b32 s6, s9 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v16 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, s7 -; GFX9-O0-NEXT: v_add_co_u32_e32 v16, vcc, v15, v16 -; GFX9-O0-NEXT: v_mov_b32_e32 v15, s6 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v12, vcc, v12, v15, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v15, s7 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v18, vcc, v14, v15, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v14, s6 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v13, vcc, v13, v14, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s7 +; GFX9-O0-NEXT: v_add_co_u32_e32 v8, vcc, v8, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s6 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v12, vcc, v11, v12, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s7 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v10, vcc, v10, v11, vcc +; GFX9-O0-NEXT: 
v_mov_b32_e32 v11, s6 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v11, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v13 +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v12 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v12 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, s5 ; GFX9-O0-NEXT: v_mov_b32_e32 v12, s4 -; GFX9-O0-NEXT: v_mov_b32_e32 v15, s5 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, s4 -; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 10 -; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 11 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 
offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v13, s5 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s4 +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_writelane_b32 v29, s4, 10 +; GFX9-O0-NEXT: v_writelane_b32 v29, s5, 11 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded 
Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_6 ; GFX9-O0-NEXT: .LBB0_8: ; %udiv-bb1 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte 
Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload @@ -1088,118 +1089,118 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 ; GFX9-O0-NEXT: s_mov_b32 s8, s6 ; GFX9-O0-NEXT: s_mov_b32 s9, s7 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-O0-NEXT: v_add_co_u32_e32 v8, vcc, v3, v4 +; GFX9-O0-NEXT: v_add_co_u32_e32 v7, vcc, v3, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v4, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s9 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc +; GFX9-O0-NEXT: v_addc_co_u32_e32 v4, vcc, v2, v4, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v2, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 
v5, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v8 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b32 s4, 0x7f ; GFX9-O0-NEXT: v_sub_u32_e64 v2, s4, v3 -; GFX9-O0-NEXT: v_lshlrev_b64 v[4:5], v2, v[10:11] -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5 +; GFX9-O0-NEXT: v_lshlrev_b64 v[0:1], v2, v[11:12] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 ; GFX9-O0-NEXT: s_mov_b32 s4, 64 ; GFX9-O0-NEXT: v_sub_u32_e64 v13, s4, v2 -; GFX9-O0-NEXT: v_lshrrev_b64 v[13:14], v13, v[6:7] +; GFX9-O0-NEXT: v_lshrrev_b64 v[13:14], v13, v[9:10] ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v14 -; GFX9-O0-NEXT: v_or_b32_e64 v12, v12, v15 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v13 -; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v5 +; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v15 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v13 +; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr0 
killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 ; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v2, s4 ; GFX9-O0-NEXT: s_mov_b32 s10, 63 ; GFX9-O0-NEXT: v_sub_u32_e64 v3, s10, v3 -; GFX9-O0-NEXT: v_lshlrev_b64 v[12:13], v3, v[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v14, s[4:5] +; GFX9-O0-NEXT: v_lshlrev_b64 v[13:14], v3, v[9:10] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v14 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[4:5] ; GFX9-O0-NEXT: s_mov_b32 s10, 0 ; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[10:11], v2, s10 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v14, s[10:11] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[10:11] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v12 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[10:11] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v13 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[10:11] ; GFX9-O0-NEXT: ; implicit-def: $sgpr10 ; GFX9-O0-NEXT: ; implicit-def: $sgpr10 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-O0-NEXT: v_lshlrev_b64 v[6:7], v2, v[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], v2, v[9:10] +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s9 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-O0-NEXT: v_cndmask_b32_e64 
v2, v2, v3, s[4:5] -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, s8 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v3, v6, s[4:5] ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2 -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9 -; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v8 -; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[0:1], s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, 
v8 +; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v7 +; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[4:5], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec ; GFX9-O0-NEXT: s_and_b64 
s[4:5], s[6:7], s[4:5] ; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] -; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 8 -; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 9 +; GFX9-O0-NEXT: v_writelane_b32 v29, s6, 8 +; GFX9-O0-NEXT: v_writelane_b32 v29, s7, 9 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 -; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_cbranch_execz .LBB0_5 @@ -1215,10 +1216,10 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 32 @@ -1492,7 +1493,7 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_lshrrev_b64 v[3:4], s4, v[3:4] ; GFX9-O0-NEXT: ; 
kill: def $vgpr3 killed $vgpr3 killed $vgpr3_vgpr4 killed $exec ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] @@ -1706,7 +1707,7 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0: ; %bb.0: ; %_udiv-special-cases ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v6 ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill @@ -1783,16 +1784,16 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 -; GFX9-O0-NEXT: ; implicit-def: $vgpr30 : SGPR spill to VGPR lane -; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 0 -; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 1 +; GFX9-O0-NEXT: ; implicit-def: $vgpr29 : SGPR spill to VGPR lane +; GFX9-O0-NEXT: v_writelane_b32 v29, s6, 0 +; GFX9-O0-NEXT: v_writelane_b32 v29, s7, 1 ; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[0:1], s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 -; GFX9-O0-NEXT: v_or_b32_e64 v14, v3, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8 -; GFX9-O0-NEXT: v_or_b32_e64 v8, v2, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v9 +; GFX9-O0-NEXT: v_or_b32_e64 v14, v1, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v8 +; 
GFX9-O0-NEXT: v_or_b32_e64 v8, v0, v2 ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v14 ; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], s[6:7] @@ -1837,18 +1838,18 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v0 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v2 ; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s9 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v1 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v3 ; GFX9-O0-NEXT: v_min_u32_e64 v5, v4, v5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v2 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v0 ; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s9 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v3 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v1 ; GFX9-O0-NEXT: v_min_u32_e64 v14, v4, v10 ; GFX9-O0-NEXT: ; implicit-def: $sgpr9 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 @@ -1932,35 +1933,35 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[12:13] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[12:13] ; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[12:13] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[12:13] ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 -; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 +; 
GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 ; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[12:13] -; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[12:13] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9] ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 -; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 s[4:5], exec -; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 2 -; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 3 +; GFX9-O0-NEXT: v_writelane_b32 v29, s4, 2 +; GFX9-O0-NEXT: v_writelane_b32 v29, s5, 3 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] @@ -1968,50 +1969,47 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_branch .LBB1_8 ; GFX9-O0-NEXT: .LBB1_1: ; %Flow 
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 4 -; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 5 +; GFX9-O0-NEXT: v_readlane_b32 s4, v29, 4 +; GFX9-O0-NEXT: v_readlane_b32 s5, v29, 5 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: ; %bb.2: ; %Flow -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(7) +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte 
Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(7) +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(7) +; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(7) +; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(7) +; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB1_5 ; GFX9-O0-NEXT: .LBB1_3: ; %Flow2 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 2 -; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 3 +; GFX9-O0-NEXT: v_readlane_b32 s4, v29, 2 +; GFX9-O0-NEXT: v_readlane_b32 s5, v29, 3 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, 
off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) @@ -2053,29 +2051,29 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB1_3 ; GFX9-O0-NEXT: .LBB1_5: ; %Flow1 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 6 -; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 7 +; 
GFX9-O0-NEXT: v_readlane_b32 s4, v29, 6 +; GFX9-O0-NEXT: v_readlane_b32 s5, v29, 7 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) @@ -2093,214 +2091,214 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: .LBB1_6: ; %udiv-do-while ; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], 
s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s6, v30, 8 -; GFX9-O0-NEXT: v_readlane_b32 s7, v30, 9 -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_readlane_b32 s6, v29, 8 +; GFX9-O0-NEXT: v_readlane_b32 s7, v29, 9 +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-O0-NEXT: 
buffer_load_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-O0-NEXT: 
buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 63 -; GFX9-O0-NEXT: s_waitcnt vmcnt(16) -; GFX9-O0-NEXT: v_lshrrev_b64 v[28:29], s4, v[2:3] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v29 +; GFX9-O0-NEXT: s_waitcnt vmcnt(10) +; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], s4, v[2:3] +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v7 ; GFX9-O0-NEXT: s_mov_b32 s5, 1 -; GFX9-O0-NEXT: v_lshlrev_b64 v[22:23], s5, v[22:23] -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v23 -; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v28 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v22 -; GFX9-O0-NEXT: v_or_b32_e64 v22, v5, v10 -; GFX9-O0-NEXT: ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v23, v4 -; GFX9-O0-NEXT: v_lshlrev_b64 v[28:29], s5, v[2:3] -; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], s4, v[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v29 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-O0-NEXT: s_waitcnt vmcnt(8) +; GFX9-O0-NEXT: v_lshlrev_b64 v[26:27], s5, v[26:27] +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v27 +; GFX9-O0-NEXT: v_or_b32_e64 v14, v14, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v26 +; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 +; GFX9-O0-NEXT: v_lshlrev_b64 v[26:27], s5, v[2:3] +; GFX9-O0-NEXT: v_lshrrev_b64 v[14:15], s4, v[10:11] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v27 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v15 ; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v28 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_or_b32_e64 v4, v3, v4 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v26 +; 
GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_or_b32_e64 v14, v3, v14 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v2 ; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s5, v[0:1] -; GFX9-O0-NEXT: v_lshlrev_b64 v[28:29], s5, v[6:7] +; GFX9-O0-NEXT: v_lshlrev_b64 v[26:27], s5, v[10:11] ; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v29 -; GFX9-O0-NEXT: s_waitcnt vmcnt(10) +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v27 -; GFX9-O0-NEXT: v_or3_b32 v6, v6, v7, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v28, v25 +; GFX9-O0-NEXT: v_or3_b32 v10, v10, v11, v28 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v28 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v26 -; GFX9-O0-NEXT: v_or3_b32 v0, v0, v1, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v26 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v24 +; GFX9-O0-NEXT: v_or3_b32 v0, v0, v1, v11 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-O0-NEXT: s_waitcnt vmcnt(8) -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v25 -; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v17 +; GFX9-O0-NEXT: v_or_b32_e64 v10, v10, v11 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v24 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v16 ; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v22 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v14 +; 
GFX9-O0-NEXT: v_mov_b32_e32 v16, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7 ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v14 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v15 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v13, vcc, v13, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_subb_co_u32_e32 v12, vcc, v12, v10, vcc -; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v11, v4, vcc -; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v5, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v22 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v7, vcc, v7, v10 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v16, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v14, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v6, vcc, v5, v11, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 -; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v7 -; GFX9-O0-NEXT: v_ashrrev_i64 v[13:14], s4, v[11:12] -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 +; GFX9-O0-NEXT: v_ashrrev_i64 v[5:6], s4, v[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v6 ; GFX9-O0-NEXT: s_mov_b64 s[4:5], 1 ; GFX9-O0-NEXT: s_mov_b32 s8, s5 -; GFX9-O0-NEXT: v_and_b32_e64 v12, v7, s8 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v13 +; GFX9-O0-NEXT: v_and_b32_e64 v4, v15, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v5 ; GFX9-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 -; GFX9-O0-NEXT: v_and_b32_e64 v14, v11, s4 -; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, 0 +; GFX9-O0-NEXT: v_and_b32_e64 v6, v17, s4 +; 
GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-O0-NEXT: v_mov_b32_e32 v22, v21 -; GFX9-O0-NEXT: v_and_b32_e64 v22, v7, v22 -; GFX9-O0-NEXT: v_and_b32_e64 v20, v11, v20 +; GFX9-O0-NEXT: v_and_b32_e64 v22, v15, v22 +; GFX9-O0-NEXT: v_and_b32_e64 v20, v17, v20 ; GFX9-O0-NEXT: ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v21, v22 ; GFX9-O0-NEXT: v_mov_b32_e32 v22, v19 -; GFX9-O0-NEXT: v_and_b32_e64 v7, v7, v22 -; GFX9-O0-NEXT: v_and_b32_e64 v22, v11, v18 -; GFX9-O0-NEXT: ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v23, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v22 -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v23 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v20 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v21 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v19 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v10, vcc, v10, v18, vcc -; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v11, vcc -; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v5, v7, vcc +; GFX9-O0-NEXT: v_and_b32_e64 v15, v15, v22 +; GFX9-O0-NEXT: v_and_b32_e64 v17, v17, v18 +; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v17 +; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 killed $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v20 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v21 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v10, vcc, v10, v19 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v16, vcc, v16, v18, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v14, vcc, v14, v17, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v11, v15, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: 
v_mov_b32_e32 v5, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v11 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10 +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v16 +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v8 ; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 killed $vgpr8_vgpr9 killed $exec ; GFX9-O0-NEXT: s_mov_b64 s[8:9], -1 ; GFX9-O0-NEXT: s_mov_b32 s5, s8 ; GFX9-O0-NEXT: s_mov_b32 s4, s9 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v16 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, s5 -; GFX9-O0-NEXT: v_add_co_u32_e32 v19, vcc, v11, v16 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, s4 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v11, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v11, s5 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v16, vcc, v10, v11, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v10, s4 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v10, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s5 +; GFX9-O0-NEXT: v_add_co_u32_e32 v11, vcc, v11, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s4 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v12, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s5 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v12, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s4 +; GFX9-O0-NEXT: 
v_addc_co_u32_e32 v10, vcc, v10, v12, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v9 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v9 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v16 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v19 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v20 -; GFX9-O0-NEXT: v_mov_b32_e32 v21, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v20 -; GFX9-O0-NEXT: v_or_b32_e64 v18, v18, v21 -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v16 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v19 -; GFX9-O0-NEXT: v_or_b32_e64 v16, v16, v17 -; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v18 -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[16:17], v[12:13] +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v8 +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v12 +; GFX9-O0-NEXT: v_or_b32_e64 v10, v10, v13 +; GFX9-O0-NEXT: 
v_mov_b32_e32 v9, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v11 +; GFX9-O0-NEXT: v_or_b32_e64 v8, v8, v9 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v10 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[8:9], v[4:5] ; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v2 -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v0 -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v15 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v14 -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v12 -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 4 -; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 5 -; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 8 -; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 9 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; 
GFX9-O0-NEXT: v_mov_b32_e32 v9, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GFX9-O0-NEXT: v_writelane_b32 v29, s6, 4 +; GFX9-O0-NEXT: v_writelane_b32 v29, s7, 5 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GFX9-O0-NEXT: v_writelane_b32 v29, s6, 8 +; GFX9-O0-NEXT: 
v_writelane_b32 v29, s7, 9 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill @@ -2318,128 +2316,128 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_branch .LBB1_1 ; GFX9-O0-NEXT: .LBB1_7: ; %udiv-preheader ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-O0-NEXT: 
buffer_load_dword v15, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(9) -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], v4, v[18:19] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-O0-NEXT: v_lshrrev_b64 v[14:15], v8, v[18:19] +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v15 ; GFX9-O0-NEXT: s_mov_b32 s4, 64 -; GFX9-O0-NEXT: v_sub_u32_e64 v20, s4, v4 -; GFX9-O0-NEXT: v_lshlrev_b64 v[20:21], v20, v[14:15] +; GFX9-O0-NEXT: v_sub_u32_e64 v20, s4, v8 +; GFX9-O0-NEXT: v_lshlrev_b64 v[20:21], v20, v[16:17] ; GFX9-O0-NEXT: v_mov_b32_e32 v22, v21 -; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v22 -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v20 -; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed 
$vgpr6 def $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v22, v7 -; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[6:7], v4, s4 -; GFX9-O0-NEXT: v_sub_u32_e64 v5, v4, s4 -; GFX9-O0-NEXT: v_lshrrev_b64 v[20:21], v5, v[14:15] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v21 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v22, s[6:7] +; GFX9-O0-NEXT: v_or_b32_e64 v11, v11, v22 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v20 +; GFX9-O0-NEXT: v_or_b32_e64 v14, v14, v15 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v15 +; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[6:7], v8, s4 +; GFX9-O0-NEXT: v_sub_u32_e64 v11, v8, s4 +; GFX9-O0-NEXT: v_lshrrev_b64 v[20:21], v11, v[16:17] +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v21 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v11, v11, v22, s[6:7] ; GFX9-O0-NEXT: s_mov_b32 s4, 0 -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, s4 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, s4 ; GFX9-O0-NEXT: v_mov_b32_e32 v22, v19 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v22, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v20 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v18 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v11, v11, v22, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v20 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v18 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[4:5] ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], v4, v[14:15] -; GFX9-O0-NEXT: v_mov_b32_e32 v15, 
v5 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v11 +; GFX9-O0-NEXT: v_lshrrev_b64 v[16:17], v8, v[16:17] +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v17 ; GFX9-O0-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-O0-NEXT: s_mov_b32 s8, s5 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, s8 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, s8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v8, v11, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 killed $vgpr16_vgpr17 killed $exec ; GFX9-O0-NEXT: s_mov_b32 s8, s4 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v16, v11, v16, s[6:7] ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v14 -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v13 +; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v8 +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v10 ; GFX9-O0-NEXT: s_mov_b64 s[8:9], -1 ; GFX9-O0-NEXT: s_mov_b32 s7, s8 ; GFX9-O0-NEXT: s_mov_b32 s6, s9 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v16 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, s7 -; GFX9-O0-NEXT: v_add_co_u32_e32 v16, vcc, v15, v16 -; 
GFX9-O0-NEXT: v_mov_b32_e32 v15, s6 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v12, vcc, v12, v15, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v15, s7 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v18, vcc, v14, v15, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v14, s6 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v13, vcc, v13, v14, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s7 +; GFX9-O0-NEXT: v_add_co_u32_e32 v8, vcc, v8, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s6 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v12, vcc, v11, v12, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s7 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v10, vcc, v10, v11, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s6 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v11, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v13 +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v12 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v12 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, s5 ; GFX9-O0-NEXT: v_mov_b32_e32 v12, s4 -; GFX9-O0-NEXT: v_mov_b32_e32 v15, s5 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, s4 -; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: 
buffer_store_dword v17, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 8 -; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 9 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v13, s5 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s4 +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_writelane_b32 v29, s4, 
8 +; GFX9-O0-NEXT: v_writelane_b32 v29, s5, 9 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill @@ -2449,12 +2447,12 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_branch .LBB1_6 ; GFX9-O0-NEXT: .LBB1_8: ; %udiv-bb1 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 
offset:40 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload @@ -2467,118 +2465,118 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 ; GFX9-O0-NEXT: s_mov_b32 s8, s6 ; GFX9-O0-NEXT: s_mov_b32 s9, s7 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-O0-NEXT: v_add_co_u32_e32 v8, vcc, v3, v4 +; GFX9-O0-NEXT: v_add_co_u32_e32 v7, vcc, v3, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v4, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s9 -; GFX9-O0-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc +; GFX9-O0-NEXT: v_addc_co_u32_e32 v4, vcc, v2, v4, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-O0-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v2, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0 -; 
GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v8 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b32 s4, 0x7f ; GFX9-O0-NEXT: v_sub_u32_e64 v2, s4, v3 -; GFX9-O0-NEXT: v_lshlrev_b64 v[4:5], v2, v[10:11] -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5 +; GFX9-O0-NEXT: v_lshlrev_b64 v[0:1], v2, v[11:12] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 ; GFX9-O0-NEXT: s_mov_b32 s4, 64 ; GFX9-O0-NEXT: v_sub_u32_e64 v13, s4, v2 -; GFX9-O0-NEXT: v_lshrrev_b64 v[13:14], v13, v[6:7] +; GFX9-O0-NEXT: v_lshrrev_b64 v[13:14], v13, v[9:10] ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v14 -; GFX9-O0-NEXT: v_or_b32_e64 v12, v12, v15 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v13 -; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v5 +; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v15 +; 
GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v13 +; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 ; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v2, s4 ; GFX9-O0-NEXT: s_mov_b32 s10, 63 ; GFX9-O0-NEXT: v_sub_u32_e64 v3, s10, v3 -; GFX9-O0-NEXT: v_lshlrev_b64 v[12:13], v3, v[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v14, s[4:5] +; GFX9-O0-NEXT: v_lshlrev_b64 v[13:14], v3, v[9:10] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v14 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[4:5] ; GFX9-O0-NEXT: s_mov_b32 s10, 0 ; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[10:11], v2, s10 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v14, s[10:11] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[10:11] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v12 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[10:11] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v13 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[10:11] ; GFX9-O0-NEXT: ; implicit-def: $sgpr10 ; GFX9-O0-NEXT: ; implicit-def: $sgpr10 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-O0-NEXT: v_lshlrev_b64 v[6:7], v2, v[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], v2, v[9:10] +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v3 +; 
GFX9-O0-NEXT: v_mov_b32_e32 v6, s9 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[4:5] -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, s8 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v3, v6, s[4:5] ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2 -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9 -; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v8 -; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v1 -; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[0:1], s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], 
s32 offset:296 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v7 +; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[4:5], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-O0-NEXT: 
s_mov_b64 s[6:7], exec ; GFX9-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] ; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] -; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 6 -; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 7 +; GFX9-O0-NEXT: v_writelane_b32 v29, s6, 6 +; GFX9-O0-NEXT: v_writelane_b32 v29, s7, 7 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_cbranch_execz .LBB1_5 @@ -2835,7 +2833,7 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_lshrrev_b64 v[3:4], s4, v[3:4] ; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr3_vgpr4 killed $exec ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index 4addf42b27984..2952633fac57e 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -369,12 +369,12 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0 ; GCN-IR-NEXT: v_add_i32_e64 v2, s[6:7], 32, v2 ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 -; GCN-IR-NEXT: v_min_u32_e32 v10, v2, v3 +; GCN-IR-NEXT: v_min_u32_e32 v18, v2, v3 ; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v6 ; GCN-IR-NEXT: v_add_i32_e64 v2, s[6:7], 32, v2 ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v7 -; GCN-IR-NEXT: v_min_u32_e32 v11, v2, v3 -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[6:7], v10, v11 +; GCN-IR-NEXT: v_min_u32_e32 v19, v2, v3 +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[6:7], v18, v19 ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, 
v[0:1] ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[6:7] ; GCN-IR-NEXT: v_subb_u32_e64 v3, s[6:7], 0, 0, s[6:7] @@ -399,47 +399,47 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[6:7], v2 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; GCN-IR-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB1_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v16, vcc, -1, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v17, vcc, -1, v1, vcc -; GCN-IR-NEXT: v_not_b32_e32 v4, v10 -; GCN-IR-NEXT: v_lshr_b64 v[8:9], v[6:7], v8 -; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, v4, v11 -; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 -; GCN-IR-NEXT: v_addc_u32_e64 v7, s[4:5], -1, 0, vcc -; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 -; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 +; GCN-IR-NEXT: v_not_b32_e32 v4, v18 +; GCN-IR-NEXT: v_lshr_b64 v[10:11], v[6:7], v8 +; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, v4, v19 +; GCN-IR-NEXT: v_addc_u32_e64 v7, s[8:9], -1, 0, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: .LBB1_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 -; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4 -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v16, v8 -; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v17, v9, vcc -; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4 +; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v4 +; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v16, v10 +; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v17, v11, vcc ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6 -; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 -; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10 -; GCN-IR-NEXT: 
v_and_b32_e32 v11, v10, v1 -; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v18, 31, v4 ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] -; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10 -; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 -; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v10, v4 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v18 +; GCN-IR-NEXT: v_and_b32_e32 v19, v18, v1 +; GCN-IR-NEXT: v_and_b32_e32 v18, v18, v0 +; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3 +; GCN-IR-NEXT: v_or_b32_e32 v2, v8, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 +; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_sub_i32_e32 v10, vcc, v10, v18 +; GCN-IR-NEXT: v_mov_b32_e32 v8, v4 +; GCN-IR-NEXT: v_subb_u32_e32 v11, vcc, v11, v19, vcc +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB1_5: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: .LBB1_5: ; %Flow4 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1 ; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v1 ; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v0 @@ -1420,9 +1420,9 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0 ; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 -; GCN-IR-NEXT: v_min_u32_e32 v10, v2, v3 +; GCN-IR-NEXT: v_min_u32_e32 v8, v2, v3 ; GCN-IR-NEXT: s_movk_i32 s6, 0xffc5 -; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s6, v10 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s6, v8 ; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] @@ -1444,46 +1444,46 @@ define i64 
@v_test_sdiv_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_lshl_b64 v[2:3], 24, v2 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; GCN-IR-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB11_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v1, vcc -; GCN-IR-NEXT: v_lshr_b64 v[8:9], 24, v6 -; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 58, v10 -; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 -; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc -; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 -; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 +; GCN-IR-NEXT: v_lshr_b64 v[10:11], 24, v6 +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 58, v8 +; GCN-IR-NEXT: v_subb_u32_e64 v7, s[8:9], 0, 0, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: .LBB11_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 -; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4 -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v14, v8 -; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v15, v9, vcc -; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4 +; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v4 +; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v14, v10 +; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v15, v11, vcc ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6 -; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 -; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10 -; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1 -; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v16, 31, v4 ; GCN-IR-NEXT: 
v_cmp_eq_u64_e32 vcc, 0, v[6:7] -; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10 -; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 -; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v10, v4 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v16 +; GCN-IR-NEXT: v_and_b32_e32 v17, v16, v1 +; GCN-IR-NEXT: v_and_b32_e32 v16, v16, v0 +; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3 +; GCN-IR-NEXT: v_or_b32_e32 v2, v8, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 +; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_sub_i32_e32 v10, vcc, v10, v16 +; GCN-IR-NEXT: v_mov_b32_e32 v8, v4 +; GCN-IR-NEXT: v_subb_u32_e32 v11, vcc, v11, v17, vcc +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz .LBB11_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB11_5: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: .LBB11_5: ; %Flow4 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1 ; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v1 ; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v0 @@ -1613,9 +1613,9 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0 ; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 -; GCN-IR-NEXT: v_min_u32_e32 v10, v2, v3 +; GCN-IR-NEXT: v_min_u32_e32 v8, v2, v3 ; GCN-IR-NEXT: s_movk_i32 s6, 0xffd0 -; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s6, v10 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s6, v8 ; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] @@ -1631,54 +1631,54 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: s_cbranch_execz .LBB12_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v2 -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 ; 
GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc -; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[4:5], v2 +; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[8:9] +; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-IR-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB12_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v1, vcc -; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v6 -; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v10 -; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 -; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc -; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 -; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 +; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[8:9], v6 +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v8 +; GCN-IR-NEXT: v_subb_u32_e64 v7, s[8:9], 0, 0, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: .LBB12_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 -; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4 -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v14, v8 -; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v15, v9, vcc -; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4 +; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v4 +; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v14, v10 +; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v15, v11, vcc ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6 -; 
GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 -; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10 -; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1 -; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v16, 31, v4 ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] -; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10 -; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 -; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v10, v4 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v16 +; GCN-IR-NEXT: v_and_b32_e32 v17, v16, v1 +; GCN-IR-NEXT: v_and_b32_e32 v16, v16, v0 +; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3 +; GCN-IR-NEXT: v_or_b32_e32 v2, v8, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 +; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_sub_i32_e32 v10, vcc, v10, v16 +; GCN-IR-NEXT: v_mov_b32_e32 v8, v4 +; GCN-IR-NEXT: v_subb_u32_e32 v11, vcc, v11, v17, vcc +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz .LBB12_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB12_5: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: .LBB12_5: ; %Flow4 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1 ; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v1 ; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v0 @@ -1715,8 +1715,8 @@ define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_ffbh_u32_e32 v0, v4 ; GCN-IR-NEXT: v_add_i32_e64 v0, s[4:5], 32, v0 ; GCN-IR-NEXT: v_ffbh_u32_e32 v1, v5 -; GCN-IR-NEXT: v_min_u32_e32 v8, v0, v1 -; GCN-IR-NEXT: v_sub_i32_e64 v0, s[4:5], 48, v8 +; GCN-IR-NEXT: v_min_u32_e32 v12, v0, v1 +; GCN-IR-NEXT: v_sub_i32_e64 v0, s[4:5], 48, v12 ; GCN-IR-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, s[4:5] ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] ; 
GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[0:1] @@ -1738,44 +1738,44 @@ define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], v0 ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; GCN-IR-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB13_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: v_lshr_b64 v[6:7], v[4:5], v6 -; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 0xffffffcf, v8 -; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 -; GCN-IR-NEXT: v_addc_u32_e64 v5, s[4:5], 0, -1, vcc -; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 -; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 -; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff +; GCN-IR-NEXT: v_lshr_b64 v[8:9], v[4:5], v6 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 0xffffffcf, v12 +; GCN-IR-NEXT: v_addc_u32_e64 v5, s[8:9], 0, -1, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: s_movk_i32 s10, 0x7fff ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 ; GCN-IR-NEXT: .LBB13_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v1 -; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v2 -; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s12, v6 -; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v2 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s10, v8 +; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, 0, v9, vcc ; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 1, v4 -; GCN-IR-NEXT: v_or_b32_e32 v0, v8, v0 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v2 ; GCN-IR-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v8 -; GCN-IR-NEXT: v_and_b32_e32 v8, 0x8000, v8 +; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v2 ; GCN-IR-NEXT: v_cmp_eq_u64_e32 
vcc, 0, v[4:5] -; GCN-IR-NEXT: v_or_b32_e32 v1, v9, v1 -; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], v6, v8 -; GCN-IR-NEXT: v_mov_b32_e32 v9, v3 -; GCN-IR-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v7, s[4:5] -; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v8, v2 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v12 +; GCN-IR-NEXT: v_and_b32_e32 v12, 0x8000, v12 +; GCN-IR-NEXT: v_or_b32_e32 v1, v7, v1 +; GCN-IR-NEXT: v_or_b32_e32 v0, v6, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, v3 +; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, v8, v12 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v2 +; GCN-IR-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v9, vcc +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz .LBB13_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB13_5: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: .LBB13_5: ; %Flow4 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 ; GCN-IR-NEXT: v_or_b32_e32 v3, v3, v1 ; GCN-IR-NEXT: v_or_b32_e32 v2, v2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index e64e3def98c26..d89daac3ecbcc 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -346,12 +346,12 @@ define i64 @v_test_srem(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v2 ; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4 ; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v3 -; GCN-IR-NEXT: v_min_u32_e32 v12, v4, v5 +; GCN-IR-NEXT: v_min_u32_e32 v10, v4, v5 ; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v0 ; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4 ; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v1 -; GCN-IR-NEXT: v_min_u32_e32 v13, v4, v5 -; GCN-IR-NEXT: v_sub_i32_e64 v4, s[6:7], v12, v13 +; GCN-IR-NEXT: v_min_u32_e32 v11, v4, v5 +; GCN-IR-NEXT: v_sub_i32_e64 v4, s[6:7], v10, v11 ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, 
v[2:3] ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_subb_u32_e64 v5, s[6:7], 0, 0, s[6:7] @@ -375,47 +375,47 @@ define i64 @v_test_srem(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[0:1], v4 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; GCN-IR-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB1_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v16, vcc, -1, v2 ; GCN-IR-NEXT: v_addc_u32_e32 v17, vcc, -1, v3, vcc -; GCN-IR-NEXT: v_not_b32_e32 v6, v12 -; GCN-IR-NEXT: v_lshr_b64 v[10:11], v[0:1], v8 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, v6, v13 -; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 -; GCN-IR-NEXT: v_addc_u32_e64 v9, s[4:5], -1, 0, vcc -; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 -; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 +; GCN-IR-NEXT: v_not_b32_e32 v6, v10 +; GCN-IR-NEXT: v_lshr_b64 v[12:13], v[0:1], v8 +; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, v6, v11 +; GCN-IR-NEXT: v_addc_u32_e64 v9, s[8:9], -1, 0, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: .LBB1_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 +; GCN-IR-NEXT: v_lshl_b64 v[12:13], v[12:13], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5 -; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v6 -; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v16, v10 -; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v17, v11, vcc -; GCN-IR-NEXT: v_or_b32_e32 v4, v12, v4 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v6 +; GCN-IR-NEXT: v_or_b32_e32 v12, v12, v6 +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v16, v12 +; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v17, v13, vcc ; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v8 -; GCN-IR-NEXT: v_or_b32_e32 v5, v13, v5 -; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v12 -; 
GCN-IR-NEXT: v_and_b32_e32 v13, v12, v3 -; GCN-IR-NEXT: v_and_b32_e32 v12, v12, v2 ; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v18, 31, v6 ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v12 -; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v13, s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v13, v7 -; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v12, v6 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v18 +; GCN-IR-NEXT: v_and_b32_e32 v19, v18, v3 +; GCN-IR-NEXT: v_and_b32_e32 v18, v18, v2 +; GCN-IR-NEXT: v_or_b32_e32 v5, v11, v5 +; GCN-IR-NEXT: v_or_b32_e32 v4, v10, v4 +; GCN-IR-NEXT: v_mov_b32_e32 v11, v7 +; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_sub_i32_e32 v12, vcc, v12, v18 +; GCN-IR-NEXT: v_mov_b32_e32 v10, v6 +; GCN-IR-NEXT: v_subb_u32_e32 v13, vcc, v13, v19, vcc +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB1_5: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: .LBB1_5: ; %Flow4 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 ; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v5 ; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v4 @@ -1538,9 +1538,9 @@ define i64 @v_test_srem_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0 ; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 -; GCN-IR-NEXT: v_min_u32_e32 v10, v2, v3 +; GCN-IR-NEXT: v_min_u32_e32 v8, v2, v3 ; GCN-IR-NEXT: s_movk_i32 s6, 0xffc5 -; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s6, v10 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s6, v8 ; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] @@ -1561,46 
+1561,46 @@ define i64 @v_test_srem_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_lshl_b64 v[2:3], 24, v2 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; GCN-IR-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB11_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc -; GCN-IR-NEXT: v_lshr_b64 v[8:9], 24, v6 -; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 58, v10 -; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 -; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc -; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 -; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 +; GCN-IR-NEXT: v_lshr_b64 v[10:11], 24, v6 +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 58, v8 +; GCN-IR-NEXT: v_subb_u32_e64 v7, s[8:9], 0, 0, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: .LBB11_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 -; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4 -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v8 -; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v13, v9, vcc -; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4 +; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v4 +; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v10 +; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v13, v11, vcc ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6 -; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 -; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10 -; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1 -; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v14, 31, v4 ; 
GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] -; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10 -; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 -; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v10, v4 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v14 +; GCN-IR-NEXT: v_and_b32_e32 v15, v14, v1 +; GCN-IR-NEXT: v_and_b32_e32 v14, v14, v0 +; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3 +; GCN-IR-NEXT: v_or_b32_e32 v2, v8, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 +; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_sub_i32_e32 v10, vcc, v10, v14 +; GCN-IR-NEXT: v_mov_b32_e32 v8, v4 +; GCN-IR-NEXT: v_subb_u32_e32 v11, vcc, v11, v15, vcc +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz .LBB11_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB11_5: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: .LBB11_5: ; %Flow4 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3 ; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2 @@ -1729,9 +1729,9 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0 ; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 -; GCN-IR-NEXT: v_min_u32_e32 v10, v2, v3 +; GCN-IR-NEXT: v_min_u32_e32 v8, v2, v3 ; GCN-IR-NEXT: s_movk_i32 s6, 0xffd0 -; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s6, v10 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s6, v8 ; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] @@ -1746,54 +1746,54 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: s_cbranch_execz .LBB12_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v2 -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 
; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc -; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[4:5], v2 +; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[8:9] +; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-IR-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB12_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc -; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v6 -; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v10 -; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 -; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc -; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 -; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 +; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[8:9], v6 +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v8 +; GCN-IR-NEXT: v_subb_u32_e64 v7, s[8:9], 0, 0, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: .LBB12_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 -; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4 -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v8 -; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v13, v9, vcc -; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4 +; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v4 +; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v10 +; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v13, v11, vcc ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6 -; 
GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 -; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10 -; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1 -; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v14, 31, v4 ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] -; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10 -; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 -; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v10, v4 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v14 +; GCN-IR-NEXT: v_and_b32_e32 v15, v14, v1 +; GCN-IR-NEXT: v_and_b32_e32 v14, v14, v0 +; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3 +; GCN-IR-NEXT: v_or_b32_e32 v2, v8, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 +; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_sub_i32_e32 v10, vcc, v10, v14 +; GCN-IR-NEXT: v_mov_b32_e32 v8, v4 +; GCN-IR-NEXT: v_subb_u32_e32 v11, vcc, v11, v15, vcc +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz .LBB12_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB12_5: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: .LBB12_5: ; %Flow4 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3 ; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2 @@ -1836,8 +1836,8 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0 ; GCN-IR-NEXT: v_add_i32_e64 v2, s[4:5], 32, v2 ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 -; GCN-IR-NEXT: v_min_u32_e32 v10, v2, v3 -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 48, v10 +; GCN-IR-NEXT: v_min_u32_e32 v8, v2, v3 +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 48, v8 ; GCN-IR-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, s[4:5] ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; 
GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[2:3] @@ -1859,44 +1859,44 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[0:1], v2 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; GCN-IR-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB13_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: v_lshr_b64 v[8:9], v[0:1], v6 -; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 0xffffffcf, v10 -; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 -; GCN-IR-NEXT: v_addc_u32_e64 v7, s[4:5], 0, -1, vcc -; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 -; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 -; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff +; GCN-IR-NEXT: v_lshr_b64 v[10:11], v[0:1], v6 +; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 0xffffffcf, v8 +; GCN-IR-NEXT: v_addc_u32_e64 v7, s[8:9], 0, -1, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: s_movk_i32 s10, 0x7fff ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: .LBB13_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 -; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4 -; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, s12, v8 -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, 0, v9, vcc +; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v4 +; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, s10, v10 +; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, 0, v11, vcc ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6 -; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4 ; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10 -; GCN-IR-NEXT: v_and_b32_e32 v10, 0x8000, v10 +; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v14, 31, v4 ; GCN-IR-NEXT: 
v_cmp_eq_u64_e32 vcc, 0, v[6:7] -; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 -; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10 -; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 -; GCN-IR-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v9, s[4:5] -; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v10, v4 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v14 +; GCN-IR-NEXT: v_and_b32_e32 v14, 0x8000, v14 +; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3 +; GCN-IR-NEXT: v_or_b32_e32 v2, v8, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 +; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_sub_i32_e32 v10, vcc, v10, v14 +; GCN-IR-NEXT: v_mov_b32_e32 v8, v4 +; GCN-IR-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v11, vcc +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz .LBB13_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB13_5: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: .LBB13_5: ; %Flow4 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3 ; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2 diff --git a/llvm/test/CodeGen/AMDGPU/sub_i1.ll b/llvm/test/CodeGen/AMDGPU/sub_i1.ll index 08ca8482d8c40..f2340627b3238 100644 --- a/llvm/test/CodeGen/AMDGPU/sub_i1.ll +++ b/llvm/test/CodeGen/AMDGPU/sub_i1.ll @@ -111,25 +111,25 @@ define amdgpu_kernel void @sub_i1_cf(ptr addrspace(1) %out, ptr addrspace(1) %a, ; GFX9-LABEL: sub_i1_cf: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 ; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX9-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GFX9-NEXT: s_xor_b64 s[8:9], exec, s[8:9] ; GFX9-NEXT: s_cbranch_execz 
.LBB2_2 ; GFX9-NEXT: ; %bb.1: ; %else ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v0, v0, s[8:9] glc +; GFX9-NEXT: global_load_ubyte v0, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 ; GFX9-NEXT: .LBB2_2: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_andn2_saveexec_b64 s[6:7], s[8:9] ; GFX9-NEXT: s_cbranch_execz .LBB2_4 ; GFX9-NEXT: ; %bb.3: ; %if ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_andn2_b64 s[2:3], s[4:5], exec @@ -139,7 +139,6 @@ define amdgpu_kernel void @sub_i1_cf(ptr addrspace(1) %out, ptr addrspace(1) %a, ; GFX9-NEXT: s_or_b64 s[4:5], s[2:3], s[4:5] ; GFX9-NEXT: .LBB2_4: ; %endif ; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_xor_b64 s[2:3], s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll index bc9a3f2389e7e..4cce3d7f33175 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -341,47 +341,47 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[0:1], v4 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; GCN-IR-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB1_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v2 -; GCN-IR-NEXT: v_lshr_b64 v[8:9], v[0:1], v10 +; GCN-IR-NEXT: v_lshr_b64 v[10:11], v[0:1], v10 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v3, vcc ; GCN-IR-NEXT: v_not_b32_e32 v0, v14 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v15 -; GCN-IR-NEXT: 
v_mov_b32_e32 v10, 0 -; GCN-IR-NEXT: v_addc_u32_e64 v1, s[4:5], -1, 0, vcc -; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 -; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 +; GCN-IR-NEXT: v_addc_u32_e64 v1, s[8:9], -1, 0, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: .LBB1_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5 -; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v6 -; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v12, v8 -; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v13, v9, vcc -; GCN-IR-NEXT: v_or_b32_e32 v4, v10, v4 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v6 +; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v6 +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v12, v10 +; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v13, v11, vcc ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GCN-IR-NEXT: v_or_b32_e32 v5, v11, v5 -; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v10 -; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v3 -; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v2 ; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v14, 31, v6 ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10 -; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v11, v7 -; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v10, v6 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v14 +; GCN-IR-NEXT: v_and_b32_e32 v15, v14, v3 +; GCN-IR-NEXT: v_and_b32_e32 v14, v14, v2 +; GCN-IR-NEXT: v_or_b32_e32 v5, v9, v5 +; GCN-IR-NEXT: v_or_b32_e32 v4, v8, v4 +; GCN-IR-NEXT: v_mov_b32_e32 v9, v7 +; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_sub_i32_e32 v10, 
vcc, v10, v14 +; GCN-IR-NEXT: v_mov_b32_e32 v8, v6 +; GCN-IR-NEXT: v_subb_u32_e32 v11, vcc, v11, v15, vcc +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB1_5: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: .LBB1_5: ; %Flow4 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 ; GCN-IR-NEXT: v_or_b32_e32 v4, v7, v1 ; GCN-IR-NEXT: v_or_b32_e32 v5, v6, v0 @@ -1065,8 +1065,8 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0 ; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 -; GCN-IR-NEXT: v_min_u32_e32 v10, v2, v3 -; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 0xffffffd0, v10 +; GCN-IR-NEXT: v_min_u32_e32 v14, v2, v3 +; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 0xffffffd0, v14 ; GCN-IR-NEXT: v_addc_u32_e64 v7, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[6:7] @@ -1081,54 +1081,54 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: s_cbranch_execz .LBB9_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v6 -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v6 ; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v7, vcc -; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v6 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] -; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[4:5], v2 +; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[8:9] +; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-IR-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB9_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: 
v_add_i32_e32 v12, vcc, -1, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc -; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v8 -; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v10 -; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 -; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc -; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 -; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v14 +; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[8:9], v8 +; GCN-IR-NEXT: v_subb_u32_e64 v7, s[8:9], 0, 0, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: .LBB9_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 -; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4 -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v8 -; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v13, v9, vcc -; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4 +; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v4 +; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v10 +; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v13, v11, vcc ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6 -; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 -; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10 -; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1 -; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v14, 31, v4 ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] -; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10 -; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 -; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v10, v4 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: v_and_b32_e32 v4, 1, 
v14 +; GCN-IR-NEXT: v_and_b32_e32 v15, v14, v1 +; GCN-IR-NEXT: v_and_b32_e32 v14, v14, v0 +; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3 +; GCN-IR-NEXT: v_or_b32_e32 v2, v8, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 +; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_sub_i32_e32 v10, vcc, v10, v14 +; GCN-IR-NEXT: v_mov_b32_e32 v8, v4 +; GCN-IR-NEXT: v_subb_u32_e32 v11, vcc, v11, v15, vcc +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz .LBB9_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB9_5: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: .LBB9_5: ; %Flow4 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1 ; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v1 ; GCN-IR-NEXT: v_or_b32_e32 v3, v4, v0 @@ -1177,44 +1177,44 @@ define i64 @v_test_udiv_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[0:1], v2 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; GCN-IR-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB10_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: v_lshr_b64 v[6:7], v[0:1], v8 +; GCN-IR-NEXT: v_lshr_b64 v[8:9], v[0:1], v8 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 0xffffffcf, v10 -; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 -; GCN-IR-NEXT: v_addc_u32_e64 v1, s[4:5], 0, -1, vcc -; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 -; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 -; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff +; GCN-IR-NEXT: v_addc_u32_e64 v1, s[8:9], 0, -1, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: s_movk_i32 s10, 0x7fff ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: .LBB10_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 ; GCN-IR-NEXT: 
v_lshrrev_b32_e32 v4, 31, v3 -; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v4 -; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, s12, v6 -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4 +; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, s10, v8 +; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, 0, v9, vcc ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GCN-IR-NEXT: v_or_b32_e32 v2, v8, v2 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v4 ; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v8 -; GCN-IR-NEXT: v_and_b32_e32 v8, 0x8000, v8 +; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4 ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3 -; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], v6, v8 -; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 -; GCN-IR-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v7, s[4:5] -; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v8, v4 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10 +; GCN-IR-NEXT: v_and_b32_e32 v10, 0x8000, v10 +; GCN-IR-NEXT: v_or_b32_e32 v3, v7, v3 +; GCN-IR-NEXT: v_or_b32_e32 v2, v6, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v7, v5 +; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, v8, v10 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v4 +; GCN-IR-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v9, vcc +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz .LBB10_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB10_5: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: .LBB10_5: ; %Flow4 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1 ; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v1 ; GCN-IR-NEXT: v_or_b32_e32 v3, v4, v0 @@ -1377,43 +1377,43 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_lshl_b64 
v[2:3], v[0:1], v2 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; GCN-IR-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB12_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: v_lshr_b64 v[6:7], v[0:1], v8 +; GCN-IR-NEXT: v_lshr_b64 v[8:9], v[0:1], v8 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 0xffffffc4, v10 -; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 -; GCN-IR-NEXT: v_addc_u32_e64 v1, s[4:5], 0, -1, vcc -; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 -; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: v_addc_u32_e64 v1, s[8:9], 0, -1, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: .LBB12_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 -; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v4 -; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 23, v6 -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4 +; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 23, v8 +; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, 0, v9, vcc ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GCN-IR-NEXT: v_or_b32_e32 v2, v8, v2 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v4 ; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v8 -; GCN-IR-NEXT: v_and_b32_e32 v8, 24, v8 +; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4 ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3 -; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], v6, v8 -; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 -; GCN-IR-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v7, s[4:5] -; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v8, v4 
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10 +; GCN-IR-NEXT: v_and_b32_e32 v10, 24, v10 +; GCN-IR-NEXT: v_or_b32_e32 v3, v7, v3 +; GCN-IR-NEXT: v_or_b32_e32 v2, v6, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v7, v5 +; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, v8, v10 +; GCN-IR-NEXT: v_mov_b32_e32 v6, v4 +; GCN-IR-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v9, vcc +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz .LBB12_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB12_5: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: .LBB12_5: ; %Flow4 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1 ; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v1 ; GCN-IR-NEXT: v_or_b32_e32 v3, v4, v0 diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll index 464dad83f47c9..832b91df0bb96 100644 --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -322,12 +322,12 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v2 ; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4 ; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v3 -; GCN-IR-NEXT: v_min_u32_e32 v12, v4, v5 +; GCN-IR-NEXT: v_min_u32_e32 v10, v4, v5 ; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v0 ; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4 ; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v1 -; GCN-IR-NEXT: v_min_u32_e32 v13, v4, v5 -; GCN-IR-NEXT: v_sub_i32_e64 v4, s[6:7], v12, v13 +; GCN-IR-NEXT: v_min_u32_e32 v11, v4, v5 +; GCN-IR-NEXT: v_sub_i32_e64 v4, s[6:7], v10, v11 ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_subb_u32_e64 v5, s[6:7], 0, 0, s[6:7] @@ -350,47 +350,47 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[0:1], v4 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: 
s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; GCN-IR-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB1_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v2 ; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v3, vcc -; GCN-IR-NEXT: v_not_b32_e32 v6, v12 -; GCN-IR-NEXT: v_lshr_b64 v[10:11], v[0:1], v8 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, v6, v13 -; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 -; GCN-IR-NEXT: v_addc_u32_e64 v9, s[4:5], -1, 0, vcc -; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 -; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 +; GCN-IR-NEXT: v_not_b32_e32 v6, v10 +; GCN-IR-NEXT: v_lshr_b64 v[12:13], v[0:1], v8 +; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, v6, v11 +; GCN-IR-NEXT: v_addc_u32_e64 v9, s[8:9], -1, 0, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: .LBB1_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 +; GCN-IR-NEXT: v_lshl_b64 v[12:13], v[12:13], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5 -; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v6 -; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v14, v10 -; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v15, v11, vcc -; GCN-IR-NEXT: v_or_b32_e32 v4, v12, v4 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v6 +; GCN-IR-NEXT: v_or_b32_e32 v12, v12, v6 +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v14, v12 +; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v15, v13, vcc ; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v8 -; GCN-IR-NEXT: v_or_b32_e32 v5, v13, v5 -; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v12 -; GCN-IR-NEXT: v_and_b32_e32 v13, v12, v3 -; GCN-IR-NEXT: v_and_b32_e32 v12, v12, v2 ; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v16, 31, v6 ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; 
GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v12 -; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v13, s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v13, v7 -; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v12, v6 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v16 +; GCN-IR-NEXT: v_and_b32_e32 v17, v16, v3 +; GCN-IR-NEXT: v_and_b32_e32 v16, v16, v2 +; GCN-IR-NEXT: v_or_b32_e32 v5, v11, v5 +; GCN-IR-NEXT: v_or_b32_e32 v4, v10, v4 +; GCN-IR-NEXT: v_mov_b32_e32 v11, v7 +; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_sub_i32_e32 v12, vcc, v12, v16 +; GCN-IR-NEXT: v_mov_b32_e32 v10, v6 +; GCN-IR-NEXT: v_subb_u32_e32 v13, vcc, v13, v17, vcc +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB1_5: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: .LBB1_5: ; %Flow4 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 ; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v5 ; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v4 @@ -1166,8 +1166,8 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0 ; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 -; GCN-IR-NEXT: v_min_u32_e32 v10, v2, v3 -; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 0xffffffd0, v10 +; GCN-IR-NEXT: v_min_u32_e32 v8, v2, v3 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 0xffffffd0, v8 ; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] @@ -1182,54 +1182,54 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: s_cbranch_execz .LBB8_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v2 -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 ; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc -; GCN-IR-NEXT: 
s_mov_b64 s[4:5], 0x8000 +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[4:5], v2 +; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[8:9] +; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-IR-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB8_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc -; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v6 -; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v10 -; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 -; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc -; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 -; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 +; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[8:9], v6 +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v8 +; GCN-IR-NEXT: v_subb_u32_e64 v7, s[8:9], 0, 0, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: .LBB8_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 -; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4 -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v8 -; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v13, v9, vcc -; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4 +; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v4 +; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v10 +; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v13, v11, vcc ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6 -; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 -; GCN-IR-NEXT: v_and_b32_e32 
v4, 1, v10 -; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1 -; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v14, 31, v4 ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] -; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10 -; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 -; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v10, v4 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v14 +; GCN-IR-NEXT: v_and_b32_e32 v15, v14, v1 +; GCN-IR-NEXT: v_and_b32_e32 v14, v14, v0 +; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3 +; GCN-IR-NEXT: v_or_b32_e32 v2, v8, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 +; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_sub_i32_e32 v10, vcc, v10, v14 +; GCN-IR-NEXT: v_mov_b32_e32 v8, v4 +; GCN-IR-NEXT: v_subb_u32_e32 v11, vcc, v11, v15, vcc +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz .LBB8_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB8_5: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: .LBB8_5: ; %Flow4 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3 ; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2 @@ -1262,8 +1262,8 @@ define i64 @v_test_urem_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0 ; GCN-IR-NEXT: v_add_i32_e64 v2, s[4:5], 32, v2 ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 -; GCN-IR-NEXT: v_min_u32_e32 v10, v2, v3 -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 48, v10 +; GCN-IR-NEXT: v_min_u32_e32 v8, v2, v3 +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 48, v8 ; GCN-IR-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, s[4:5] ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[2:3] @@ -1284,44 +1284,44 @@ define 
i64 @v_test_urem_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[0:1], v2 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; GCN-IR-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB9_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: v_lshr_b64 v[8:9], v[0:1], v6 -; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 0xffffffcf, v10 -; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 -; GCN-IR-NEXT: v_addc_u32_e64 v7, s[4:5], 0, -1, vcc -; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 -; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 -; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff +; GCN-IR-NEXT: v_lshr_b64 v[10:11], v[0:1], v6 +; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 0xffffffcf, v8 +; GCN-IR-NEXT: v_addc_u32_e64 v7, s[8:9], 0, -1, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 +; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: s_movk_i32 s10, 0x7fff ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: .LBB9_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 -; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4 -; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, s12, v8 -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, 0, v9, vcc +; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v4 +; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, s10, v10 +; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, 0, v11, vcc ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6 -; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4 ; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10 -; GCN-IR-NEXT: v_and_b32_e32 v10, 0x8000, v10 +; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v4 ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] -; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 -; 
GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10 -; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 -; GCN-IR-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v9, s[4:5] -; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v10, v4 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v12 +; GCN-IR-NEXT: v_and_b32_e32 v12, 0x8000, v12 +; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3 +; GCN-IR-NEXT: v_or_b32_e32 v2, v8, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 +; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_sub_i32_e32 v10, vcc, v10, v12 +; GCN-IR-NEXT: v_mov_b32_e32 v8, v4 +; GCN-IR-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v11, vcc +; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz .LBB9_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB9_5: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: .LBB9_5: ; %Flow4 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3 ; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2 diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll index d8264b5a091e1..f4741ce128105 100644 --- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll +++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll @@ -508,12 +508,12 @@ define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace( ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX942-NEXT: v_and_b32_e32 v16, 0x3ff, v0 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v16 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v16 ; GFX942-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v16 ; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v16 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[8:9] -; GFX942-NEXT: ; implicit-def: $vgpr2 +; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[8:9] +; GFX942-NEXT: ; implicit-def: 
$vgpr0 ; GFX942-NEXT: ; implicit-def: $vgpr12 ; GFX942-NEXT: ; implicit-def: $vgpr10 ; GFX942-NEXT: ; implicit-def: $vgpr13 @@ -521,74 +521,75 @@ define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace( ; GFX942-NEXT: ; implicit-def: $vgpr11 ; GFX942-NEXT: ; implicit-def: $vgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v4, 24, v1 -; GFX942-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX942-NEXT: v_lshrrev_b32_e32 v6, 8, v1 -; GFX942-NEXT: v_lshrrev_b32_e32 v7, 24, v0 -; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GFX942-NEXT: v_lshrrev_b32_e32 v9, 8, v0 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, 24, v3 +; GFX942-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX942-NEXT: v_lshrrev_b32_e32 v6, 8, v3 +; GFX942-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v9, 8, v2 ; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX942-NEXT: s_cbranch_execz .LBB10_2 ; GFX942-NEXT: ; %bb.1: ; %bb.1 -; GFX942-NEXT: global_load_dwordx2 v[2:3], v3, s[10:11] +; GFX942-NEXT: global_load_dwordx2 v[0:1], v1, s[10:11] ; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v16 ; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX942-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX942-NEXT: v_mov_b32_e32 v4, 8 ; GFX942-NEXT: v_mov_b32_e32 v5, 7 ; GFX942-NEXT: v_mov_b32_e32 v6, 6 -; GFX942-NEXT: v_mov_b32_e32 v1, 5 +; GFX942-NEXT: v_mov_b32_e32 v3, 5 ; GFX942-NEXT: v_mov_b32_e32 v7, 4 ; GFX942-NEXT: v_mov_b32_e32 v8, 3 ; GFX942-NEXT: v_mov_b32_e32 v9, 2 -; GFX942-NEXT: v_mov_b32_e32 v0, 1 +; GFX942-NEXT: v_mov_b32_e32 v2, 1 ; GFX942-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v15, 24, v3 -; GFX942-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GFX942-NEXT: v_lshrrev_b32_e32 v14, 8, v3 -; GFX942-NEXT: v_lshrrev_b32_e32 v13, 24, v2 -; GFX942-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GFX942-NEXT: v_lshrrev_b32_e32 v12, 8, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v15, 24, v1 +; 
GFX942-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; GFX942-NEXT: v_lshrrev_b32_e32 v14, 8, v1 +; GFX942-NEXT: v_lshrrev_b32_e32 v13, 24, v0 +; GFX942-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; GFX942-NEXT: v_lshrrev_b32_e32 v12, 8, v0 ; GFX942-NEXT: .LBB10_2: ; %Flow ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX942-NEXT: s_cbranch_execz .LBB10_4 ; GFX942-NEXT: ; %bb.3: ; %bb.2 -; GFX942-NEXT: v_lshlrev_b16_e32 v2, 8, v9 -; GFX942-NEXT: v_lshlrev_b16_e32 v3, 8, v7 -; GFX942-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX942-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX942-NEXT: v_lshlrev_b16_e32 v0, 8, v9 +; GFX942-NEXT: v_lshlrev_b16_e32 v1, 8, v7 +; GFX942-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX942-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX942-NEXT: v_lshlrev_b16_e32 v11, 8, v4 -; GFX942-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX942-NEXT: v_lshlrev_b16_e32 v3, 8, v6 -; GFX942-NEXT: v_or_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX942-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX942-NEXT: v_lshlrev_b16_e32 v1, 8, v6 +; GFX942-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX942-NEXT: v_or_b32_sdwa v11, v5, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX942-NEXT: v_mov_b32_e32 v10, 0 -; GFX942-NEXT: v_or_b32_sdwa v3, v3, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX942-NEXT: global_store_dwordx2 v10, v[2:3], s[12:13] -; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: v_or_b32_sdwa v1, v1, v11 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX942-NEXT: global_store_dwordx2 v10, v[0:1], s[12:13] +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v12, v9 ; GFX942-NEXT: v_mov_b32_e32 v10, v8 ; GFX942-NEXT: v_mov_b32_e32 v13, v7 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v1, v3 ; GFX942-NEXT: v_mov_b32_e32 v14, v6 ; GFX942-NEXT: v_mov_b32_e32 v11, v5 ; GFX942-NEXT: v_mov_b32_e32 v15, v4 ; GFX942-NEXT: .LBB10_4: ; %bb.3 ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_lshlrev_b16_e32 v0, 8, v12 -; GFX942-NEXT: v_lshlrev_b16_e32 v1, 8, v13 -; GFX942-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX942-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX942-NEXT: v_lshlrev_b16_e32 v2, 8, v15 -; GFX942-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX942-NEXT: v_lshlrev_b16_e32 v1, 8, v14 -; GFX942-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX942-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[14:15] +; GFX942-NEXT: v_lshlrev_b16_e32 v3, 8, v12 +; GFX942-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX942-NEXT: v_lshlrev_b16_e32 v3, 8, v13 +; GFX942-NEXT: v_or_b32_sdwa v3, v10, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX942-NEXT: v_lshlrev_b16_e32 v3, 8, v14 +; GFX942-NEXT: v_or_b32_sdwa v1, v1, v3 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX942-NEXT: v_lshlrev_b16_e32 v3, 8, v15 +; GFX942-NEXT: v_or_b32_sdwa v3, v11, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[14:15] ; GFX942-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index 097154ed23ede..8236442f8fe53 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -1292,23 +1292,23 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, p ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_clause 0x1 ; GFX1064-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v0 ; GFX1064-NEXT: s_mov_b64 vcc, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dwordx3 v[1:3], v1, s[10:11] -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] ; GFX1064-NEXT: s_cbranch_execz .LBB22_2 ; GFX1064-NEXT: ; %bb.1: ; %bb ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: global_load_dword v0, v0, s[6:7] glc dlc +; GFX1064-NEXT: global_load_dword v0, v0, s[2:3] glc dlc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_b64 vcc, vcc, exec ; GFX1064-NEXT: .LBB22_2: ; %exit ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_div_fmas_f32 v1, v1, v2, v3 diff --git 
a/llvm/test/CodeGen/ARM/and-cmp0-sink.ll b/llvm/test/CodeGen/ARM/and-cmp0-sink.ll index fb9139c0d1285..389193f9d2407 100644 --- a/llvm/test/CodeGen/ARM/and-cmp0-sink.ll +++ b/llvm/test/CodeGen/ARM/and-cmp0-sink.ll @@ -112,20 +112,20 @@ define void @f(i32 %v, ptr noalias %outp) { ; V6M-NEXT: push {r4, lr} ; V6M-NEXT: movs r2, #0 ; V6M-NEXT: str r2, [r1] -; V6M-NEXT: movs r3, #14 -; V6M-NEXT: ands r3, r0 +; V6M-NEXT: movs r2, #14 +; V6M-NEXT: ands r2, r0 ; V6M-NEXT: movs r4, #4 ; V6M-NEXT: ands r4, r0 -; V6M-NEXT: movs r2, #2 -; V6M-NEXT: ands r2, r0 +; V6M-NEXT: movs r3, #2 +; V6M-NEXT: ands r3, r0 ; V6M-NEXT: lsls r0, r0, #31 ; V6M-NEXT: bne .LBB0_5 ; V6M-NEXT: @ %bb.1: @ %if.then ; V6M-NEXT: movs r0, #129 -; V6M-NEXT: cmp r2, #0 +; V6M-NEXT: cmp r3, #0 ; V6M-NEXT: beq .LBB0_3 ; V6M-NEXT: @ %bb.2: -; V6M-NEXT: lsls r2, r0, #8 +; V6M-NEXT: lsls r3, r0, #8 ; V6M-NEXT: .LBB0_3: @ %if.then ; V6M-NEXT: cmp r4, #0 ; V6M-NEXT: beq .LBB0_10 @@ -134,22 +134,22 @@ define void @f(i32 %v, ptr noalias %outp) { ; V6M-NEXT: b .LBB0_9 ; V6M-NEXT: .LBB0_5: @ %if.else ; V6M-NEXT: movs r0, #129 -; V6M-NEXT: cmp r2, #0 +; V6M-NEXT: cmp r3, #0 ; V6M-NEXT: beq .LBB0_7 ; V6M-NEXT: @ %bb.6: -; V6M-NEXT: lsls r2, r0, #6 +; V6M-NEXT: lsls r3, r0, #6 ; V6M-NEXT: .LBB0_7: @ %if.else ; V6M-NEXT: cmp r4, #0 ; V6M-NEXT: beq .LBB0_10 ; V6M-NEXT: @ %bb.8: @ %if.else ; V6M-NEXT: lsls r0, r0, #5 ; V6M-NEXT: .LBB0_9: @ %if.else -; V6M-NEXT: adds r2, r2, r0 +; V6M-NEXT: adds r3, r3, r0 ; V6M-NEXT: .LBB0_10: @ %if.else -; V6M-NEXT: cmp r3, #0 +; V6M-NEXT: cmp r2, #0 ; V6M-NEXT: beq .LBB0_12 ; V6M-NEXT: @ %bb.11: @ %if.end -; V6M-NEXT: str r2, [r1] +; V6M-NEXT: str r3, [r1] ; V6M-NEXT: .LBB0_12: @ %exit ; V6M-NEXT: pop {r4, pc} entry: diff --git a/llvm/test/CodeGen/ARM/cttz.ll b/llvm/test/CodeGen/ARM/cttz.ll index 1146ad64ee709..e1bf4837d6a47 100644 --- a/llvm/test/CodeGen/ARM/cttz.ll +++ b/llvm/test/CodeGen/ARM/cttz.ll @@ -229,24 +229,24 @@ define i64 @test_i64(i64 %a) { ; CHECK-6M-NEXT: orrs r0, r3 ; 
CHECK-6M-NEXT: beq .LBB3_6 ; CHECK-6M-NEXT: @ %bb.1: @ %cond.false -; CHECK-6M-NEXT: ldr r6, .LCPI3_0 +; CHECK-6M-NEXT: ldr r5, .LCPI3_0 ; CHECK-6M-NEXT: adr r4, .LCPI3_1 ; CHECK-6M-NEXT: movs r0, #32 ; CHECK-6M-NEXT: cmp r3, #0 -; CHECK-6M-NEXT: mov r5, r0 +; CHECK-6M-NEXT: mov r6, r0 ; CHECK-6M-NEXT: beq .LBB3_3 ; CHECK-6M-NEXT: @ %bb.2: @ %cond.false -; CHECK-6M-NEXT: rsbs r5, r3, #0 -; CHECK-6M-NEXT: ands r5, r3 -; CHECK-6M-NEXT: muls r5, r6, r5 -; CHECK-6M-NEXT: lsrs r3, r5, #27 -; CHECK-6M-NEXT: ldrb r5, [r4, r3] +; CHECK-6M-NEXT: rsbs r6, r3, #0 +; CHECK-6M-NEXT: ands r6, r3 +; CHECK-6M-NEXT: muls r6, r5, r6 +; CHECK-6M-NEXT: lsrs r3, r6, #27 +; CHECK-6M-NEXT: ldrb r6, [r4, r3] ; CHECK-6M-NEXT: .LBB3_3: @ %cond.false -; CHECK-6M-NEXT: adds r5, #32 +; CHECK-6M-NEXT: adds r6, #32 ; CHECK-6M-NEXT: rsbs r3, r2, #0 ; CHECK-6M-NEXT: ands r3, r2 -; CHECK-6M-NEXT: muls r6, r3, r6 -; CHECK-6M-NEXT: lsrs r3, r6, #27 +; CHECK-6M-NEXT: muls r5, r3, r5 +; CHECK-6M-NEXT: lsrs r3, r5, #27 ; CHECK-6M-NEXT: cmp r2, #0 ; CHECK-6M-NEXT: bne .LBB3_7 ; CHECK-6M-NEXT: @ %bb.4: @ %cond.false @@ -260,7 +260,7 @@ define i64 @test_i64(i64 %a) { ; CHECK-6M-NEXT: ldrb r0, [r4, r3] ; CHECK-6M-NEXT: bne .LBB3_5 ; CHECK-6M-NEXT: .LBB3_8: @ %cond.false -; CHECK-6M-NEXT: mov r0, r5 +; CHECK-6M-NEXT: mov r0, r6 ; CHECK-6M-NEXT: pop {r4, r5, r6, pc} ; CHECK-6M-NEXT: .p2align 2 ; CHECK-6M-NEXT: @ %bb.9: @@ -279,24 +279,24 @@ define i64 @test_i64(i64 %a) { ; CHECK-8MBASE-NEXT: orrs r0, r3 ; CHECK-8MBASE-NEXT: beq .LBB3_6 ; CHECK-8MBASE-NEXT: @ %bb.1: @ %cond.false -; CHECK-8MBASE-NEXT: movw r6, #46385 -; CHECK-8MBASE-NEXT: movt r6, #1916 +; CHECK-8MBASE-NEXT: movw r5, #46385 +; CHECK-8MBASE-NEXT: movt r5, #1916 ; CHECK-8MBASE-NEXT: adr r4, .LCPI3_0 ; CHECK-8MBASE-NEXT: movs r0, #32 -; CHECK-8MBASE-NEXT: mov r5, r0 +; CHECK-8MBASE-NEXT: mov r6, r0 ; CHECK-8MBASE-NEXT: cbz r3, .LBB3_3 ; CHECK-8MBASE-NEXT: @ %bb.2: @ %cond.false -; CHECK-8MBASE-NEXT: rsbs r5, r3, #0 -; CHECK-8MBASE-NEXT: ands r5, 
r3 -; CHECK-8MBASE-NEXT: muls r5, r6, r5 -; CHECK-8MBASE-NEXT: lsrs r3, r5, #27 -; CHECK-8MBASE-NEXT: ldrb r5, [r4, r3] +; CHECK-8MBASE-NEXT: rsbs r6, r3, #0 +; CHECK-8MBASE-NEXT: ands r6, r3 +; CHECK-8MBASE-NEXT: muls r6, r5, r6 +; CHECK-8MBASE-NEXT: lsrs r3, r6, #27 +; CHECK-8MBASE-NEXT: ldrb r6, [r4, r3] ; CHECK-8MBASE-NEXT: .LBB3_3: @ %cond.false -; CHECK-8MBASE-NEXT: adds r5, #32 +; CHECK-8MBASE-NEXT: adds r6, #32 ; CHECK-8MBASE-NEXT: rsbs r3, r2, #0 ; CHECK-8MBASE-NEXT: ands r3, r2 -; CHECK-8MBASE-NEXT: muls r6, r3, r6 -; CHECK-8MBASE-NEXT: lsrs r3, r6, #27 +; CHECK-8MBASE-NEXT: muls r5, r3, r5 +; CHECK-8MBASE-NEXT: lsrs r3, r5, #27 ; CHECK-8MBASE-NEXT: cmp r2, #0 ; CHECK-8MBASE-NEXT: bne .LBB3_7 ; CHECK-8MBASE-NEXT: @ %bb.4: @ %cond.false @@ -310,7 +310,7 @@ define i64 @test_i64(i64 %a) { ; CHECK-8MBASE-NEXT: ldrb r0, [r4, r3] ; CHECK-8MBASE-NEXT: bne .LBB3_5 ; CHECK-8MBASE-NEXT: .LBB3_8: @ %cond.false -; CHECK-8MBASE-NEXT: mov r0, r5 +; CHECK-8MBASE-NEXT: mov r0, r6 ; CHECK-8MBASE-NEXT: pop {r4, r5, r6, pc} ; CHECK-8MBASE-NEXT: .p2align 2 ; CHECK-8MBASE-NEXT: @ %bb.9: @@ -503,24 +503,24 @@ define i64 @test_i64_zero_undef(i64 %a) { ; CHECK-6M-NEXT: .save {r4, r5, r7, lr} ; CHECK-6M-NEXT: push {r4, r5, r7, lr} ; CHECK-6M-NEXT: mov r2, r0 -; CHECK-6M-NEXT: ldr r5, .LCPI7_0 +; CHECK-6M-NEXT: ldr r4, .LCPI7_0 ; CHECK-6M-NEXT: adr r3, .LCPI7_1 ; CHECK-6M-NEXT: movs r0, #32 ; CHECK-6M-NEXT: cmp r1, #0 -; CHECK-6M-NEXT: mov r4, r0 +; CHECK-6M-NEXT: mov r5, r0 ; CHECK-6M-NEXT: beq .LBB7_2 ; CHECK-6M-NEXT: @ %bb.1: -; CHECK-6M-NEXT: rsbs r4, r1, #0 -; CHECK-6M-NEXT: ands r4, r1 -; CHECK-6M-NEXT: muls r4, r5, r4 -; CHECK-6M-NEXT: lsrs r1, r4, #27 -; CHECK-6M-NEXT: ldrb r4, [r3, r1] +; CHECK-6M-NEXT: rsbs r5, r1, #0 +; CHECK-6M-NEXT: ands r5, r1 +; CHECK-6M-NEXT: muls r5, r4, r5 +; CHECK-6M-NEXT: lsrs r1, r5, #27 +; CHECK-6M-NEXT: ldrb r5, [r3, r1] ; CHECK-6M-NEXT: .LBB7_2: -; CHECK-6M-NEXT: adds r4, #32 +; CHECK-6M-NEXT: adds r5, #32 ; CHECK-6M-NEXT: rsbs r1, r2, #0 ; 
CHECK-6M-NEXT: ands r1, r2 -; CHECK-6M-NEXT: muls r5, r1, r5 -; CHECK-6M-NEXT: lsrs r1, r5, #27 +; CHECK-6M-NEXT: muls r4, r1, r4 +; CHECK-6M-NEXT: lsrs r1, r4, #27 ; CHECK-6M-NEXT: cmp r2, #0 ; CHECK-6M-NEXT: bne .LBB7_5 ; CHECK-6M-NEXT: @ %bb.3: @@ -532,7 +532,7 @@ define i64 @test_i64_zero_undef(i64 %a) { ; CHECK-6M-NEXT: ldrb r0, [r3, r1] ; CHECK-6M-NEXT: bne .LBB7_4 ; CHECK-6M-NEXT: .LBB7_6: -; CHECK-6M-NEXT: mov r0, r4 +; CHECK-6M-NEXT: mov r0, r5 ; CHECK-6M-NEXT: movs r1, #0 ; CHECK-6M-NEXT: pop {r4, r5, r7, pc} ; CHECK-6M-NEXT: .p2align 2 @@ -547,24 +547,24 @@ define i64 @test_i64_zero_undef(i64 %a) { ; CHECK-8MBASE-NEXT: .save {r4, r5, r7, lr} ; CHECK-8MBASE-NEXT: push {r4, r5, r7, lr} ; CHECK-8MBASE-NEXT: mov r2, r0 -; CHECK-8MBASE-NEXT: movw r5, #46385 -; CHECK-8MBASE-NEXT: movt r5, #1916 +; CHECK-8MBASE-NEXT: movw r4, #46385 +; CHECK-8MBASE-NEXT: movt r4, #1916 ; CHECK-8MBASE-NEXT: adr r3, .LCPI7_0 ; CHECK-8MBASE-NEXT: movs r0, #32 -; CHECK-8MBASE-NEXT: mov r4, r0 +; CHECK-8MBASE-NEXT: mov r5, r0 ; CHECK-8MBASE-NEXT: cbz r1, .LBB7_2 ; CHECK-8MBASE-NEXT: @ %bb.1: -; CHECK-8MBASE-NEXT: rsbs r4, r1, #0 -; CHECK-8MBASE-NEXT: ands r4, r1 -; CHECK-8MBASE-NEXT: muls r4, r5, r4 -; CHECK-8MBASE-NEXT: lsrs r1, r4, #27 -; CHECK-8MBASE-NEXT: ldrb r4, [r3, r1] +; CHECK-8MBASE-NEXT: rsbs r5, r1, #0 +; CHECK-8MBASE-NEXT: ands r5, r1 +; CHECK-8MBASE-NEXT: muls r5, r4, r5 +; CHECK-8MBASE-NEXT: lsrs r1, r5, #27 +; CHECK-8MBASE-NEXT: ldrb r5, [r3, r1] ; CHECK-8MBASE-NEXT: .LBB7_2: -; CHECK-8MBASE-NEXT: adds r4, #32 +; CHECK-8MBASE-NEXT: adds r5, #32 ; CHECK-8MBASE-NEXT: rsbs r1, r2, #0 ; CHECK-8MBASE-NEXT: ands r1, r2 -; CHECK-8MBASE-NEXT: muls r5, r1, r5 -; CHECK-8MBASE-NEXT: lsrs r1, r5, #27 +; CHECK-8MBASE-NEXT: muls r4, r1, r4 +; CHECK-8MBASE-NEXT: lsrs r1, r4, #27 ; CHECK-8MBASE-NEXT: cmp r2, #0 ; CHECK-8MBASE-NEXT: bne .LBB7_5 ; CHECK-8MBASE-NEXT: @ %bb.3: @@ -576,7 +576,7 @@ define i64 @test_i64_zero_undef(i64 %a) { ; CHECK-8MBASE-NEXT: ldrb r0, [r3, r1] ; 
CHECK-8MBASE-NEXT: bne .LBB7_4 ; CHECK-8MBASE-NEXT: .LBB7_6: -; CHECK-8MBASE-NEXT: mov r0, r4 +; CHECK-8MBASE-NEXT: mov r0, r5 ; CHECK-8MBASE-NEXT: movs r1, #0 ; CHECK-8MBASE-NEXT: pop {r4, r5, r7, pc} ; CHECK-8MBASE-NEXT: .p2align 2 diff --git a/llvm/test/CodeGen/ARM/select-imm.ll b/llvm/test/CodeGen/ARM/select-imm.ll index 186276b50ceeb..2bef1c83d7969 100644 --- a/llvm/test/CodeGen/ARM/select-imm.ll +++ b/llvm/test/CodeGen/ARM/select-imm.ll @@ -455,13 +455,13 @@ define void @t9(ptr %a, i8 %b) { ; ARMT2-NEXT: cmp r0, r0 ; ARMT2-NEXT: popne {r4, pc} ; ARMT2-NEXT: .LBB8_1: @ %while.body.preheader -; ARMT2-NEXT: add r1, r4, #1 -; ARMT2-NEXT: mov r2, r0 +; ARMT2-NEXT: mov r1, r0 +; ARMT2-NEXT: add r2, r4, #1 ; ARMT2-NEXT: .LBB8_2: @ %while.body ; ARMT2-NEXT: @ =>This Inner Loop Header: Depth=1 -; ARMT2-NEXT: add r2, r2, #1 ; ARMT2-NEXT: add r1, r1, #1 -; ARMT2-NEXT: uxtb r3, r2 +; ARMT2-NEXT: add r2, r2, #1 +; ARMT2-NEXT: uxtb r3, r1 ; ARMT2-NEXT: cmp r3, r0 ; ARMT2-NEXT: blt .LBB8_2 ; ARMT2-NEXT: @ %bb.3: @ %while.end @@ -503,13 +503,13 @@ define void @t9(ptr %a, i8 %b) { ; THUMB2-NEXT: it ne ; THUMB2-NEXT: popne {r4, pc} ; THUMB2-NEXT: .LBB8_1: @ %while.body.preheader -; THUMB2-NEXT: adds r1, r4, #1 -; THUMB2-NEXT: mov r2, r0 +; THUMB2-NEXT: mov r1, r0 +; THUMB2-NEXT: adds r2, r4, #1 ; THUMB2-NEXT: .LBB8_2: @ %while.body ; THUMB2-NEXT: @ =>This Inner Loop Header: Depth=1 -; THUMB2-NEXT: adds r2, #1 ; THUMB2-NEXT: adds r1, #1 -; THUMB2-NEXT: uxtb r3, r2 +; THUMB2-NEXT: adds r2, #1 +; THUMB2-NEXT: uxtb r3, r1 ; THUMB2-NEXT: cmp r3, r0 ; THUMB2-NEXT: blt .LBB8_2 ; THUMB2-NEXT: @ %bb.3: @ %while.end diff --git a/llvm/test/CodeGen/ARM/struct-byval-loop.ll b/llvm/test/CodeGen/ARM/struct-byval-loop.ll index 7a38dec2434f7..a90381acf4214 100644 --- a/llvm/test/CodeGen/ARM/struct-byval-loop.ll +++ b/llvm/test/CodeGen/ARM/struct-byval-loop.ll @@ -13,13 +13,13 @@ define void @test_80() { ; CHECK-NEXT: .pad #152 ; CHECK-NEXT: sub sp, sp, #152 ; CHECK-NEXT: add r0, sp, #72 -; 
CHECK-NEXT: ldr r1, .LCPI0_0 +; CHECK-NEXT: ldr r2, .LCPI0_0 ; CHECK-NEXT: add r0, r0, #12 -; CHECK-NEXT: mov r2, sp +; CHECK-NEXT: mov r1, sp ; CHECK-NEXT: .LBB0_1: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r3, [r0], #4 -; CHECK-NEXT: subs r1, r1, #4 -; CHECK-NEXT: str r3, [r2], #4 +; CHECK-NEXT: subs r2, r2, #4 +; CHECK-NEXT: str r3, [r1], #4 ; CHECK-NEXT: bne .LBB0_1 ; CHECK-NEXT: @ %bb.2: ; CHECK-NEXT: add r3, sp, #72 @@ -52,14 +52,14 @@ define void @test_4000() { ; CHECK-NEXT: sub sp, sp, #920 ; CHECK-NEXT: sub sp, sp, #3072 ; CHECK-NEXT: add lr, sp, #3072 -; CHECK-NEXT: ldr r1, .LCPI1_0 +; CHECK-NEXT: ldr r2, .LCPI1_0 ; CHECK-NEXT: add r0, lr, #920 -; CHECK-NEXT: mov r2, sp +; CHECK-NEXT: mov r1, sp ; CHECK-NEXT: add r0, r0, #12 ; CHECK-NEXT: .LBB1_1: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r3, [r0], #4 -; CHECK-NEXT: subs r1, r1, #4 -; CHECK-NEXT: str r3, [r2], #4 +; CHECK-NEXT: subs r2, r2, #4 +; CHECK-NEXT: str r3, [r1], #4 ; CHECK-NEXT: bne .LBB1_1 ; CHECK-NEXT: @ %bb.2: ; CHECK-NEXT: ldr r1, [sp, #3992] diff --git a/llvm/test/CodeGen/ARM/swifterror.ll b/llvm/test/CodeGen/ARM/swifterror.ll index f002c54fc60c0..259c20c8c9af6 100644 --- a/llvm/test/CodeGen/ARM/swifterror.ll +++ b/llvm/test/CodeGen/ARM/swifterror.ll @@ -79,17 +79,17 @@ define float @caller(ptr %error_ref) { ; ; CHECK-O0-LABEL: caller: ; CHECK-O0: @ %bb.0: @ %entry -; CHECK-O0-NEXT: push {r7, r8, lr} -; CHECK-O0-NEXT: add r7, sp, #4 -; CHECK-O0-NEXT: sub sp, sp, #12 +; CHECK-O0-NEXT: push {r7, r8, lr} +; CHECK-O0-NEXT: add r7, sp, #4 +; CHECK-O0-NEXT: sub sp, sp, #12 ; CHECK-O0-NEXT: @ implicit-def: $r1 -; CHECK-O0-NEXT: str r0, [sp] @ 4-byte Spill -; CHECK-O0-NEXT: mov r8, #0 -; CHECK-O0-NEXT: bl _foo -; CHECK-O0-NEXT: str r8, [sp, #4] @ 4-byte Spill -; CHECK-O0-NEXT: movw r0, #0 -; CHECK-O0-NEXT: cmp r8, r0 -; CHECK-O0-NEXT: bne LBB1_2 +; CHECK-O0-NEXT: str r0, [sp] @ 4-byte Spill +; CHECK-O0-NEXT: mov r8, #0 +; CHECK-O0-NEXT: bl _foo +; CHECK-O0-NEXT: str r8, [sp, 
#4] @ 4-byte Spill +; CHECK-O0-NEXT: movw r0, #0 +; CHECK-O0-NEXT: cmp r8, r0 +; CHECK-O0-NEXT: bne LBB1_2 ; CHECK-O0-NEXT: @ %bb.1: @ %cont ; CHECK-O0-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-O0-NEXT: ldr r0, [sp, #4] @ 4-byte Reload @@ -100,7 +100,7 @@ define float @caller(ptr %error_ref) { ; CHECK-O0-NEXT: bl _free ; CHECK-O0-NEXT: mov r0, #1065353216 ; CHECK-O0-NEXT: sub sp, r7, #4 -; CHECK-O0-NEXT: pop {r7, r8, pc} +; CHECK-O0-NEXT: pop {r7, r8, pc} ; ; CHECK-ANDROID-LABEL: caller: ; CHECK-ANDROID: @ %bb.0: @ %entry @@ -174,11 +174,11 @@ define float @caller2(ptr %error_ref) { ; ; CHECK-O0-LABEL: caller2: ; CHECK-O0: @ %bb.0: @ %entry -; CHECK-O0-NEXT: push {r7, r8, lr} -; CHECK-O0-NEXT: add r7, sp, #4 -; CHECK-O0-NEXT: sub sp, sp, #16 +; CHECK-O0-NEXT: push {r7, r8, lr} +; CHECK-O0-NEXT: add r7, sp, #4 +; CHECK-O0-NEXT: sub sp, sp, #16 ; CHECK-O0-NEXT: @ implicit-def: $r1 -; CHECK-O0-NEXT: str r0, [sp, #8] @ 4-byte Spill +; CHECK-O0-NEXT: str r0, [sp, #8] @ 4-byte Spill ; CHECK-O0-NEXT: LBB2_1: @ %bb_loop ; CHECK-O0-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-O0-NEXT: mov r8, #0 @@ -206,7 +206,7 @@ define float @caller2(ptr %error_ref) { ; CHECK-O0-NEXT: bl _free ; CHECK-O0-NEXT: mov r0, #1065353216 ; CHECK-O0-NEXT: sub sp, r7, #4 -; CHECK-O0-NEXT: pop {r7, r8, pc} +; CHECK-O0-NEXT: pop {r7, r8, pc} ; ; CHECK-ANDROID-LABEL: caller2: ; CHECK-ANDROID: @ %bb.0: @ %entry @@ -400,35 +400,35 @@ define float @foo_loop(ptr swifterror %error_ptr_ref, i32 %cc, float %cc2) { ; CHECK-O0-NEXT: mov r7, sp ; CHECK-O0-NEXT: sub sp, sp, #20 ; CHECK-O0-NEXT: str r0, [sp, #8] @ 4-byte Spill +; CHECK-O0-NEXT: str r8, [r7, #-8] @ 4-byte Spill ; CHECK-O0-NEXT: vmov s0, r1 -; CHECK-O0-NEXT: vstr s0, [r7, #-8] @ 4-byte Spill -; CHECK-O0-NEXT: str r8, [r7, #-4] @ 4-byte Spill +; CHECK-O0-NEXT: vstr s0, [r7, #-4] @ 4-byte Spill ; CHECK-O0-NEXT: b LBB4_1 ; CHECK-O0-NEXT: LBB4_1: @ %bb_loop ; CHECK-O0-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-O0-NEXT: ldr r1, [sp, #8] @ 
4-byte Reload -; CHECK-O0-NEXT: ldr r0, [r7, #-4] @ 4-byte Reload -; CHECK-O0-NEXT: cmp r1, #0 -; CHECK-O0-NEXT: str r0, [sp, #4] @ 4-byte Spill +; CHECK-O0-NEXT: ldr r0, [sp, #8] @ 4-byte Reload +; CHECK-O0-NEXT: ldr r1, [r7, #-8] @ 4-byte Reload +; CHECK-O0-NEXT: str r1, [sp, #4] @ 4-byte Spill +; CHECK-O0-NEXT: cmp r0, #0 ; CHECK-O0-NEXT: beq LBB4_3 ; CHECK-O0-NEXT: @ %bb.2: @ %gen_error ; CHECK-O0-NEXT: @ in Loop: Header=BB4_1 Depth=1 ; CHECK-O0-NEXT: mov r0, #16 ; CHECK-O0-NEXT: mov r1, #0 ; CHECK-O0-NEXT: bl _malloc -; CHECK-O0-NEXT: mov r2, r0 -; CHECK-O0-NEXT: movw r1, #1 -; CHECK-O0-NEXT: strb r1, [r2, #8] +; CHECK-O0-NEXT: mov r1, r0 ; CHECK-O0-NEXT: str r0, [sp, #4] @ 4-byte Spill +; CHECK-O0-NEXT: movw r0, #1 +; CHECK-O0-NEXT: strb r0, [r1, #8] ; CHECK-O0-NEXT: LBB4_3: @ %bb_cont ; CHECK-O0-NEXT: @ in Loop: Header=BB4_1 Depth=1 -; CHECK-O0-NEXT: vldr s0, [r7, #-8] @ 4-byte Reload +; CHECK-O0-NEXT: vldr s0, [r7, #-4] @ 4-byte Reload ; CHECK-O0-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-O0-NEXT: str r0, [sp] @ 4-byte Spill ; CHECK-O0-NEXT: vmov.f32 s2, #1.000000e+00 ; CHECK-O0-NEXT: vcmp.f32 s0, s2 ; CHECK-O0-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-O0-NEXT: str r0, [r7, #-4] @ 4-byte Spill +; CHECK-O0-NEXT: str r0, [r7, #-8] @ 4-byte Spill ; CHECK-O0-NEXT: ble LBB4_1 ; CHECK-O0-NEXT: @ %bb.4: @ %bb_end ; CHECK-O0-NEXT: ldr r8, [sp] @ 4-byte Reload @@ -581,20 +581,20 @@ define float @caller3(ptr %error_ref) { ; ; CHECK-O0-LABEL: caller3: ; CHECK-O0: @ %bb.0: @ %entry -; CHECK-O0-NEXT: push {r7, r8, lr} -; CHECK-O0-NEXT: add r7, sp, #4 -; CHECK-O0-NEXT: sub sp, sp, #44 -; CHECK-O0-NEXT: bfc sp, #0, #3 +; CHECK-O0-NEXT: push {r7, r8, lr} +; CHECK-O0-NEXT: add r7, sp, #4 +; CHECK-O0-NEXT: sub sp, sp, #44 +; CHECK-O0-NEXT: bfc sp, #0, #3 ; CHECK-O0-NEXT: @ implicit-def: $r1 -; CHECK-O0-NEXT: str r0, [sp, #4] @ 4-byte Spill -; CHECK-O0-NEXT: mov r8, #0 -; CHECK-O0-NEXT: add r0, sp, #16 -; CHECK-O0-NEXT: mov r1, #1 -; CHECK-O0-NEXT: bl _foo_sret -; 
CHECK-O0-NEXT: str r8, [sp, #8] @ 4-byte Spill -; CHECK-O0-NEXT: movw r0, #0 -; CHECK-O0-NEXT: cmp r8, r0 -; CHECK-O0-NEXT: bne LBB6_2 +; CHECK-O0-NEXT: str r0, [sp, #4] @ 4-byte Spill +; CHECK-O0-NEXT: mov r8, #0 +; CHECK-O0-NEXT: add r0, sp, #16 +; CHECK-O0-NEXT: mov r1, #1 +; CHECK-O0-NEXT: bl _foo_sret +; CHECK-O0-NEXT: str r8, [sp, #8] @ 4-byte Spill +; CHECK-O0-NEXT: movw r0, #0 +; CHECK-O0-NEXT: cmp r8, r0 +; CHECK-O0-NEXT: bne LBB6_2 ; CHECK-O0-NEXT: @ %bb.1: @ %cont ; CHECK-O0-NEXT: ldr r1, [sp, #4] @ 4-byte Reload ; CHECK-O0-NEXT: ldr r0, [sp, #8] @ 4-byte Reload @@ -605,7 +605,7 @@ define float @caller3(ptr %error_ref) { ; CHECK-O0-NEXT: bl _free ; CHECK-O0-NEXT: mov r0, #1065353216 ; CHECK-O0-NEXT: sub sp, r7, #4 -; CHECK-O0-NEXT: pop {r7, r8, pc} +; CHECK-O0-NEXT: pop {r7, r8, pc} ; ; CHECK-ANDROID-LABEL: caller3: ; CHECK-ANDROID: @ %bb.0: @ %entry @@ -803,26 +803,26 @@ define float @caller4(ptr %error_ref) { ; ; CHECK-O0-LABEL: caller4: ; CHECK-O0: @ %bb.0: @ %entry -; CHECK-O0-NEXT: push {r7, r8, lr} -; CHECK-O0-NEXT: add r7, sp, #4 -; CHECK-O0-NEXT: sub sp, sp, #24 +; CHECK-O0-NEXT: push {r7, r8, lr} +; CHECK-O0-NEXT: add r7, sp, #4 +; CHECK-O0-NEXT: sub sp, sp, #24 ; CHECK-O0-NEXT: @ implicit-def: $r1 -; CHECK-O0-NEXT: str r0, [sp] @ 4-byte Spill -; CHECK-O0-NEXT: mov r8, #0 -; CHECK-O0-NEXT: mov r0, #10 -; CHECK-O0-NEXT: str r0, [r7, #-12] -; CHECK-O0-NEXT: mov r0, #11 -; CHECK-O0-NEXT: str r0, [sp, #12] -; CHECK-O0-NEXT: mov r0, #12 -; CHECK-O0-NEXT: str r0, [sp, #8] -; CHECK-O0-NEXT: ldr r0, [r7, #-12] -; CHECK-O0-NEXT: ldr r1, [sp, #12] -; CHECK-O0-NEXT: ldr r2, [sp, #8] -; CHECK-O0-NEXT: bl _foo_vararg -; CHECK-O0-NEXT: str r8, [sp, #4] @ 4-byte Spill -; CHECK-O0-NEXT: movw r0, #0 -; CHECK-O0-NEXT: cmp r8, r0 -; CHECK-O0-NEXT: bne LBB8_2 +; CHECK-O0-NEXT: str r0, [sp] @ 4-byte Spill +; CHECK-O0-NEXT: mov r8, #0 +; CHECK-O0-NEXT: mov r0, #10 +; CHECK-O0-NEXT: str r0, [r7, #-12] +; CHECK-O0-NEXT: mov r0, #11 +; CHECK-O0-NEXT: str r0, [sp, #12] 
+; CHECK-O0-NEXT: mov r0, #12 +; CHECK-O0-NEXT: str r0, [sp, #8] +; CHECK-O0-NEXT: ldr r0, [r7, #-12] +; CHECK-O0-NEXT: ldr r1, [sp, #12] +; CHECK-O0-NEXT: ldr r2, [sp, #8] +; CHECK-O0-NEXT: bl _foo_vararg +; CHECK-O0-NEXT: str r8, [sp, #4] @ 4-byte Spill +; CHECK-O0-NEXT: movw r0, #0 +; CHECK-O0-NEXT: cmp r8, r0 +; CHECK-O0-NEXT: bne LBB8_2 ; CHECK-O0-NEXT: @ %bb.1: @ %cont ; CHECK-O0-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-O0-NEXT: ldr r0, [sp, #4] @ 4-byte Reload @@ -833,7 +833,7 @@ define float @caller4(ptr %error_ref) { ; CHECK-O0-NEXT: bl _free ; CHECK-O0-NEXT: mov r0, #1065353216 ; CHECK-O0-NEXT: sub sp, r7, #4 -; CHECK-O0-NEXT: pop {r7, r8, pc} +; CHECK-O0-NEXT: pop {r7, r8, pc} ; ; CHECK-ANDROID-LABEL: caller4: ; CHECK-ANDROID: @ %bb.0: @ %entry @@ -987,12 +987,12 @@ define swiftcc void @swifterror_reg_clobber(ptr nocapture %err) { ; ; CHECK-O0-LABEL: swifterror_reg_clobber: ; CHECK-O0: @ %bb.0: -; CHECK-O0-NEXT: push {r7, r8, lr} -; CHECK-O0-NEXT: add r7, sp, #4 +; CHECK-O0-NEXT: push {r7, r8, lr} +; CHECK-O0-NEXT: add r7, sp, #4 ; CHECK-O0-NEXT: @ InlineAsm Start ; CHECK-O0-NEXT: nop ; CHECK-O0-NEXT: @ InlineAsm End -; CHECK-O0-NEXT: pop {r7, r8, pc} +; CHECK-O0-NEXT: pop {r7, r8, pc} ; ; CHECK-ANDROID-LABEL: swifterror_reg_clobber: ; CHECK-ANDROID: @ %bb.0: @@ -1038,34 +1038,34 @@ define swiftcc void @params_in_reg(i32, i32, i32, i32, ptr swiftself, ptr nocapt ; ; CHECK-O0-LABEL: params_in_reg: ; CHECK-O0: @ %bb.0: -; CHECK-O0-NEXT: push {r7, r10, lr} -; CHECK-O0-NEXT: add r7, sp, #4 -; CHECK-O0-NEXT: sub sp, sp, #28 -; CHECK-O0-NEXT: bfc sp, #0, #3 -; CHECK-O0-NEXT: str r8, [sp, #20] @ 4-byte Spill -; CHECK-O0-NEXT: str r10, [sp] @ 4-byte Spill -; CHECK-O0-NEXT: str r3, [sp, #16] @ 4-byte Spill -; CHECK-O0-NEXT: str r2, [sp, #12] @ 4-byte Spill -; CHECK-O0-NEXT: str r1, [sp, #8] @ 4-byte Spill -; CHECK-O0-NEXT: str r0, [sp, #4] @ 4-byte Spill +; CHECK-O0-NEXT: push {r7, r10, lr} +; CHECK-O0-NEXT: add r7, sp, #4 +; CHECK-O0-NEXT: sub sp, sp, #28 +; 
CHECK-O0-NEXT: bfc sp, #0, #3 +; CHECK-O0-NEXT: str r8, [sp, #20] @ 4-byte Spill +; CHECK-O0-NEXT: str r10, [sp] @ 4-byte Spill +; CHECK-O0-NEXT: str r3, [sp, #16] @ 4-byte Spill +; CHECK-O0-NEXT: str r2, [sp, #12] @ 4-byte Spill +; CHECK-O0-NEXT: str r1, [sp, #8] @ 4-byte Spill +; CHECK-O0-NEXT: str r0, [sp, #4] @ 4-byte Spill ; CHECK-O0-NEXT: @ implicit-def: $r0 -; CHECK-O0-NEXT: mov r8, #0 -; CHECK-O0-NEXT: mov r0, #1 -; CHECK-O0-NEXT: mov r1, #2 -; CHECK-O0-NEXT: mov r2, #3 -; CHECK-O0-NEXT: mov r3, #4 -; CHECK-O0-NEXT: mov r10, r8 -; CHECK-O0-NEXT: bl _params_in_reg2 -; CHECK-O0-NEXT: ldr r10, [sp] @ 4-byte Reload -; CHECK-O0-NEXT: ldr r0, [sp, #4] @ 4-byte Reload -; CHECK-O0-NEXT: ldr r1, [sp, #8] @ 4-byte Reload -; CHECK-O0-NEXT: ldr r2, [sp, #12] @ 4-byte Reload -; CHECK-O0-NEXT: ldr r3, [sp, #16] @ 4-byte Reload -; CHECK-O0-NEXT: mov r9, r8 -; CHECK-O0-NEXT: ldr r8, [sp, #20] @ 4-byte Reload -; CHECK-O0-NEXT: bl _params_in_reg2 -; CHECK-O0-NEXT: sub sp, r7, #4 -; CHECK-O0-NEXT: pop {r7, r10, pc} +; CHECK-O0-NEXT: mov r8, #0 +; CHECK-O0-NEXT: mov r0, #1 +; CHECK-O0-NEXT: mov r1, #2 +; CHECK-O0-NEXT: mov r2, #3 +; CHECK-O0-NEXT: mov r3, #4 +; CHECK-O0-NEXT: mov r10, r8 +; CHECK-O0-NEXT: bl _params_in_reg2 +; CHECK-O0-NEXT: ldr r10, [sp] @ 4-byte Reload +; CHECK-O0-NEXT: ldr r0, [sp, #4] @ 4-byte Reload +; CHECK-O0-NEXT: ldr r1, [sp, #8] @ 4-byte Reload +; CHECK-O0-NEXT: ldr r2, [sp, #12] @ 4-byte Reload +; CHECK-O0-NEXT: ldr r3, [sp, #16] @ 4-byte Reload +; CHECK-O0-NEXT: mov r9, r8 +; CHECK-O0-NEXT: ldr r8, [sp, #20] @ 4-byte Reload +; CHECK-O0-NEXT: bl _params_in_reg2 +; CHECK-O0-NEXT: sub sp, r7, #4 +; CHECK-O0-NEXT: pop {r7, r10, pc} ; ; CHECK-ANDROID-LABEL: params_in_reg: ; CHECK-ANDROID: @ %bb.0: @@ -1153,63 +1153,63 @@ define swiftcc { i32, i32, i32, i32} @params_and_return_in_reg(i32, i32, i32, i3 ; ; CHECK-O0-LABEL: params_and_return_in_reg: ; CHECK-O0: @ %bb.0: -; CHECK-O0-NEXT: push {r7, r10, lr} -; CHECK-O0-NEXT: add r7, sp, #4 -; CHECK-O0-NEXT: 
sub sp, sp, #76 -; CHECK-O0-NEXT: bfc sp, #0, #3 -; CHECK-O0-NEXT: str r8, [sp, #24] @ 4-byte Spill -; CHECK-O0-NEXT: str r10, [sp, #4] @ 4-byte Spill -; CHECK-O0-NEXT: str r3, [sp, #20] @ 4-byte Spill -; CHECK-O0-NEXT: str r2, [sp, #16] @ 4-byte Spill -; CHECK-O0-NEXT: str r1, [sp, #12] @ 4-byte Spill -; CHECK-O0-NEXT: str r0, [sp, #8] @ 4-byte Spill +; CHECK-O0-NEXT: push {r7, r10, lr} +; CHECK-O0-NEXT: add r7, sp, #4 +; CHECK-O0-NEXT: sub sp, sp, #76 +; CHECK-O0-NEXT: bfc sp, #0, #3 +; CHECK-O0-NEXT: str r8, [sp, #24] @ 4-byte Spill +; CHECK-O0-NEXT: str r10, [sp, #4] @ 4-byte Spill +; CHECK-O0-NEXT: str r3, [sp, #20] @ 4-byte Spill +; CHECK-O0-NEXT: str r2, [sp, #16] @ 4-byte Spill +; CHECK-O0-NEXT: str r1, [sp, #12] @ 4-byte Spill +; CHECK-O0-NEXT: str r0, [sp, #8] @ 4-byte Spill ; CHECK-O0-NEXT: @ implicit-def: $r0 -; CHECK-O0-NEXT: mov r8, #0 -; CHECK-O0-NEXT: str r8, [sp, #28] @ 4-byte Spill -; CHECK-O0-NEXT: mov r0, #1 -; CHECK-O0-NEXT: str r0, [sp, #32] @ 4-byte Spill -; CHECK-O0-NEXT: mov r1, #2 -; CHECK-O0-NEXT: str r1, [sp, #36] @ 4-byte Spill -; CHECK-O0-NEXT: mov r2, #3 -; CHECK-O0-NEXT: str r2, [sp, #40] @ 4-byte Spill -; CHECK-O0-NEXT: mov r3, #4 -; CHECK-O0-NEXT: str r3, [sp, #44] @ 4-byte Spill -; CHECK-O0-NEXT: mov r10, r8 -; CHECK-O0-NEXT: bl _params_in_reg2 -; CHECK-O0-NEXT: ldr r10, [sp, #4] @ 4-byte Reload -; CHECK-O0-NEXT: ldr r0, [sp, #8] @ 4-byte Reload -; CHECK-O0-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; CHECK-O0-NEXT: ldr r2, [sp, #16] @ 4-byte Reload -; CHECK-O0-NEXT: ldr r3, [sp, #20] @ 4-byte Reload -; CHECK-O0-NEXT: mov r9, r8 -; CHECK-O0-NEXT: ldr r8, [sp, #24] @ 4-byte Reload -; CHECK-O0-NEXT: str r9, [sp, #48] @ 4-byte Spill -; CHECK-O0-NEXT: bl _params_and_return_in_reg2 -; CHECK-O0-NEXT: ldr r10, [sp, #28] @ 4-byte Reload -; CHECK-O0-NEXT: mov r9, r0 -; CHECK-O0-NEXT: ldr r0, [sp, #32] @ 4-byte Reload -; CHECK-O0-NEXT: str r9, [sp, #52] @ 4-byte Spill -; CHECK-O0-NEXT: mov r9, r1 -; CHECK-O0-NEXT: ldr r1, [sp, #36] @ 4-byte 
Reload -; CHECK-O0-NEXT: str r9, [sp, #56] @ 4-byte Spill -; CHECK-O0-NEXT: mov r9, r2 -; CHECK-O0-NEXT: ldr r2, [sp, #40] @ 4-byte Reload -; CHECK-O0-NEXT: str r9, [sp, #60] @ 4-byte Spill -; CHECK-O0-NEXT: mov r9, r3 -; CHECK-O0-NEXT: ldr r3, [sp, #44] @ 4-byte Reload -; CHECK-O0-NEXT: str r9, [sp, #64] @ 4-byte Spill -; CHECK-O0-NEXT: mov r9, r8 -; CHECK-O0-NEXT: ldr r8, [sp, #48] @ 4-byte Reload -; CHECK-O0-NEXT: str r9, [sp, #68] @ 4-byte Spill -; CHECK-O0-NEXT: bl _params_in_reg2 -; CHECK-O0-NEXT: ldr r0, [sp, #52] @ 4-byte Reload -; CHECK-O0-NEXT: ldr r1, [sp, #56] @ 4-byte Reload -; CHECK-O0-NEXT: ldr r2, [sp, #60] @ 4-byte Reload -; CHECK-O0-NEXT: ldr r3, [sp, #64] @ 4-byte Reload -; CHECK-O0-NEXT: mov r9, r8 -; CHECK-O0-NEXT: ldr r8, [sp, #68] @ 4-byte Reload -; CHECK-O0-NEXT: sub sp, r7, #4 -; CHECK-O0-NEXT: pop {r7, r10, pc} +; CHECK-O0-NEXT: mov r8, #0 +; CHECK-O0-NEXT: str r8, [sp, #28] @ 4-byte Spill +; CHECK-O0-NEXT: mov r0, #1 +; CHECK-O0-NEXT: str r0, [sp, #32] @ 4-byte Spill +; CHECK-O0-NEXT: mov r1, #2 +; CHECK-O0-NEXT: str r1, [sp, #36] @ 4-byte Spill +; CHECK-O0-NEXT: mov r2, #3 +; CHECK-O0-NEXT: str r2, [sp, #40] @ 4-byte Spill +; CHECK-O0-NEXT: mov r3, #4 +; CHECK-O0-NEXT: str r3, [sp, #44] @ 4-byte Spill +; CHECK-O0-NEXT: mov r10, r8 +; CHECK-O0-NEXT: bl _params_in_reg2 +; CHECK-O0-NEXT: ldr r10, [sp, #4] @ 4-byte Reload +; CHECK-O0-NEXT: ldr r0, [sp, #8] @ 4-byte Reload +; CHECK-O0-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-O0-NEXT: ldr r2, [sp, #16] @ 4-byte Reload +; CHECK-O0-NEXT: ldr r3, [sp, #20] @ 4-byte Reload +; CHECK-O0-NEXT: mov r9, r8 +; CHECK-O0-NEXT: ldr r8, [sp, #24] @ 4-byte Reload +; CHECK-O0-NEXT: str r9, [sp, #48] @ 4-byte Spill +; CHECK-O0-NEXT: bl _params_and_return_in_reg2 +; CHECK-O0-NEXT: ldr r10, [sp, #28] @ 4-byte Reload +; CHECK-O0-NEXT: mov r9, r0 +; CHECK-O0-NEXT: ldr r0, [sp, #32] @ 4-byte Reload +; CHECK-O0-NEXT: str r9, [sp, #52] @ 4-byte Spill +; CHECK-O0-NEXT: mov r9, r1 +; CHECK-O0-NEXT: ldr r1, [sp, 
#36] @ 4-byte Reload +; CHECK-O0-NEXT: str r9, [sp, #56] @ 4-byte Spill +; CHECK-O0-NEXT: mov r9, r2 +; CHECK-O0-NEXT: ldr r2, [sp, #40] @ 4-byte Reload +; CHECK-O0-NEXT: str r9, [sp, #60] @ 4-byte Spill +; CHECK-O0-NEXT: mov r9, r3 +; CHECK-O0-NEXT: ldr r3, [sp, #44] @ 4-byte Reload +; CHECK-O0-NEXT: str r9, [sp, #64] @ 4-byte Spill +; CHECK-O0-NEXT: mov r9, r8 +; CHECK-O0-NEXT: ldr r8, [sp, #48] @ 4-byte Reload +; CHECK-O0-NEXT: str r9, [sp, #68] @ 4-byte Spill +; CHECK-O0-NEXT: bl _params_in_reg2 +; CHECK-O0-NEXT: ldr r0, [sp, #52] @ 4-byte Reload +; CHECK-O0-NEXT: ldr r1, [sp, #56] @ 4-byte Reload +; CHECK-O0-NEXT: ldr r2, [sp, #60] @ 4-byte Reload +; CHECK-O0-NEXT: ldr r3, [sp, #64] @ 4-byte Reload +; CHECK-O0-NEXT: mov r9, r8 +; CHECK-O0-NEXT: ldr r8, [sp, #68] @ 4-byte Reload +; CHECK-O0-NEXT: sub sp, r7, #4 +; CHECK-O0-NEXT: pop {r7, r10, pc} ; ; CHECK-ANDROID-LABEL: params_and_return_in_reg: ; CHECK-ANDROID: @ %bb.0: @@ -1325,17 +1325,17 @@ define swiftcc ptr @testAssign(ptr %error_ref) { ; ; CHECK-O0-LABEL: testAssign: ; CHECK-O0: @ %bb.0: @ %entry -; CHECK-O0-NEXT: push {r7, r8, lr} -; CHECK-O0-NEXT: add r7, sp, #4 -; CHECK-O0-NEXT: sub sp, sp, #8 +; CHECK-O0-NEXT: push {r7, r8, lr} +; CHECK-O0-NEXT: add r7, sp, #4 +; CHECK-O0-NEXT: sub sp, sp, #8 ; CHECK-O0-NEXT: @ implicit-def: $r1 -; CHECK-O0-NEXT: mov r8, #0 -; CHECK-O0-NEXT: bl _foo2 -; CHECK-O0-NEXT: str r8, [sp] @ 4-byte Spill +; CHECK-O0-NEXT: mov r8, #0 +; CHECK-O0-NEXT: bl _foo2 +; CHECK-O0-NEXT: str r8, [sp] @ 4-byte Spill ; CHECK-O0-NEXT: @ %bb.1: @ %a ; CHECK-O0-NEXT: ldr r0, [sp] @ 4-byte Reload ; CHECK-O0-NEXT: sub sp, r7, #4 -; CHECK-O0-NEXT: pop {r7, r8, pc} +; CHECK-O0-NEXT: pop {r7, r8, pc} ; ; CHECK-ANDROID-LABEL: testAssign: ; CHECK-ANDROID: @ %bb.0: @ %entry diff --git a/llvm/test/CodeGen/AVR/bug-81911.ll b/llvm/test/CodeGen/AVR/bug-81911.ll index 2a22666a1ff92..d3436e2da1d3d 100644 --- a/llvm/test/CodeGen/AVR/bug-81911.ll +++ b/llvm/test/CodeGen/AVR/bug-81911.ll @@ -41,31 +41,31 @@ 
define internal i8 @main() { ; CHECK-NEXT: adiw r24, 6 ; CHECK-NEXT: std Y+3, r25 ; 2-byte Folded Spill ; CHECK-NEXT: std Y+2, r24 ; 2-byte Folded Spill -; CHECK-NEXT: movw r8, r16 -; CHECK-NEXT: movw r6, r16 -; CHECK-NEXT: movw r4, r16 ; CHECK-NEXT: movw r2, r16 +; CHECK-NEXT: movw r4, r16 +; CHECK-NEXT: movw r6, r16 +; CHECK-NEXT: movw r8, r16 ; CHECK-NEXT: rjmp .LBB0_2 ; CHECK-NEXT: .LBB0_1: ; %bb1 ; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; CHECK-NEXT: andi r30, 1 ; CHECK-NEXT: ldd r31, Y+4 ; 1-byte Folded Reload ; CHECK-NEXT: dec r31 +; CHECK-NEXT: movw r8, r24 +; CHECK-NEXT: movw r6, r22 +; CHECK-NEXT: movw r4, r20 +; CHECK-NEXT: movw r2, r18 ; CHECK-NEXT: cpi r30, 0 -; CHECK-NEXT: movw r8, r18 -; CHECK-NEXT: movw r6, r20 -; CHECK-NEXT: movw r4, r22 -; CHECK-NEXT: movw r2, r24 ; CHECK-NEXT: mov r18, r31 ; CHECK-NEXT: brne .LBB0_2 ; CHECK-NEXT: rjmp .LBB0_4 ; CHECK-NEXT: .LBB0_2: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: std Y+4, r18 ; 1-byte Folded Spill -; CHECK-NEXT: movw r18, r8 -; CHECK-NEXT: movw r20, r6 -; CHECK-NEXT: movw r22, r4 -; CHECK-NEXT: movw r24, r2 +; CHECK-NEXT: movw r18, r2 +; CHECK-NEXT: movw r20, r4 +; CHECK-NEXT: movw r22, r6 +; CHECK-NEXT: movw r24, r8 ; CHECK-NEXT: ldi r26, 10 ; CHECK-NEXT: ldi r27, 0 ; CHECK-NEXT: movw r10, r26 @@ -85,14 +85,14 @@ define internal i8 @main() { ; CHECK-NEXT: ;APP ; CHECK-NEXT: ;NO_APP ; CHECK-NEXT: ldi r30, 1 -; CHECK-NEXT: cp r8, r1 -; CHECK-NEXT: cpc r9, r1 -; CHECK-NEXT: cpc r6, r16 -; CHECK-NEXT: cpc r7, r17 +; CHECK-NEXT: cp r2, r1 +; CHECK-NEXT: cpc r3, r1 ; CHECK-NEXT: cpc r4, r16 ; CHECK-NEXT: cpc r5, r17 -; CHECK-NEXT: cpc r2, r16 -; CHECK-NEXT: cpc r3, r17 +; CHECK-NEXT: cpc r6, r16 +; CHECK-NEXT: cpc r7, r17 +; CHECK-NEXT: cpc r8, r16 +; CHECK-NEXT: cpc r9, r17 ; CHECK-NEXT: breq .LBB0_3 ; CHECK-NEXT: rjmp .LBB0_1 ; CHECK-NEXT: .LBB0_3: ; %bb1 diff --git a/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll b/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll index 
006a8b6bfc94a..4d75a733526b0 100644 --- a/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll +++ b/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll @@ -1,7 +1,7 @@ ; RUN: llc -mtriple=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s ; This version of the conv3x3 test has both loops. This test checks that the -; inner loop has 14 packets. +; inner loop has 13 packets. ; CHECK: loop0(.LBB0_[[LOOP:.]], ; CHECK: .LBB0_[[LOOP]]: @@ -17,7 +17,6 @@ ; CHECK: } ; CHECK: } ; CHECK: } -; CHECK: } ; CHECK-NOT: } ; CHECK: }{{[ \t]*}}:endloop0 diff --git a/llvm/test/CodeGen/Hexagon/swp-epilog-phi7.ll b/llvm/test/CodeGen/Hexagon/swp-epilog-phi7.ll index 96a38939dc50e..b50290525002d 100644 --- a/llvm/test/CodeGen/Hexagon/swp-epilog-phi7.ll +++ b/llvm/test/CodeGen/Hexagon/swp-epilog-phi7.ll @@ -12,8 +12,8 @@ ; CHECK: [[EPLOG]]: ; CHECK: [[VREG1:v([0-9]+)]] = [[VREG]] ; CHECK: [[EPLOG1]]: -; CHECK: [[VREG2:v[0-9]+]] = [[VREG1]] -; CHECK: = vlalign([[VREG1]],[[VREG2]],#1) +; CHECK: [[VREG2:v[0-9]+]] = [[VREG]] +; CHECK: = vlalign([[VREG2]],[[VREG1]],#1) ; Function Attrs: nounwind define void @f0(ptr noalias nocapture readonly %a0, i32 %a1, i32 %a2, ptr noalias nocapture readonly %a3, i32 %a4, ptr noalias nocapture %a5, i32 %a6) #0 { diff --git a/llvm/test/CodeGen/Hexagon/swp-matmul-bitext.ll b/llvm/test/CodeGen/Hexagon/swp-matmul-bitext.ll index 42efe60b96d48..c4dbbcc5969ca 100644 --- a/llvm/test/CodeGen/Hexagon/swp-matmul-bitext.ll +++ b/llvm/test/CodeGen/Hexagon/swp-matmul-bitext.ll @@ -3,7 +3,7 @@ ; From coremark. Test that we pipeline the matrix multiplication bitextract ; function. The pipelined code should have two packets. 
-; CHECK: loop0(.LBB0_[[LOOP:.]], +; CHECK: loop0(.LBB0_[[LOOP:[0-9]+]], ; CHECK: .LBB0_[[LOOP]]: ; CHECK: [[REG0:(r[0-9]+)]] = mpyi([[REG1:(r[0-9]+)]],[[REG2:(r[0-9]+)]]) ; CHECK: += mpyi diff --git a/llvm/test/CodeGen/Hexagon/swp-stages4.ll b/llvm/test/CodeGen/Hexagon/swp-stages4.ll index 0d029dc7d2f2e..bddf9cebe7160 100644 --- a/llvm/test/CodeGen/Hexagon/swp-stages4.ll +++ b/llvm/test/CodeGen/Hexagon/swp-stages4.ll @@ -3,11 +3,8 @@ ; Test that we rename registers correctly for multiple stages when there is a ; Phi and depends upon another Phi. -; CHECK: = and -; CHECK: = and -; CHECK: r[[REGA:[0-9]+]] = memub(r{{[0-9]+}}+#1) -; CHECK: = and -; CHECK: r[[REG0:[0-9]+]] = and(r[[REG1:[0-9]+]],#255) +; CHECK: jump +; CHECK-NEXT: r[[REG0:[0-9]+]] = and(r[[REG1:[0-9]+]],#255) ; CHECK-NOT: r[[REG0]] = and(r[[REG1]],#255) ; CHECK: loop0(.LBB0_[[LOOP:.]], ; CHECK: .LBB0_[[LOOP]]: diff --git a/llvm/test/CodeGen/Hexagon/tinycore.ll b/llvm/test/CodeGen/Hexagon/tinycore.ll index c44038e767194..b20a7831df4d8 100644 --- a/llvm/test/CodeGen/Hexagon/tinycore.ll +++ b/llvm/test/CodeGen/Hexagon/tinycore.ll @@ -8,10 +8,15 @@ ; CHECK: .LBB0_[[LOOP]]: ; CHECK: { ; CHECK-NEXT: mpy -; CHECK-NEXT: combine -; CHECK-NEXT: memw -; CHECK-NEXT: } +; CHECK-NOT: memw +; CHECK: } +; CHECK: { +; CHECK: memw +; CHECK-NOT: memw +; CHECK: } +; CHECK: { ; CHECK: memw +; CHECK-NOT: memw ; CHECK: } :endloop0 ; Test the loop contains a single packet with 4 instructions. 
diff --git a/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll b/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll index 9142e718e8adc..06edb736e0435 100644 --- a/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll +++ b/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll @@ -39,17 +39,17 @@ define void @test_la_pcrel(i32 signext %n) { ; ; LA64LARGE-LABEL: test_la_pcrel: ; LA64LARGE: # %bb.0: # %entry -; LA64LARGE-NEXT: pcalau12i $a1, %pc_hi20(l) -; LA64LARGE-NEXT: addi.d $a2, $zero, %pc_lo12(l) -; LA64LARGE-NEXT: lu32i.d $a2, %pc64_lo20(l) -; LA64LARGE-NEXT: lu52i.d $a2, $a2, %pc64_hi12(l) -; LA64LARGE-NEXT: move $a3, $zero +; LA64LARGE-NEXT: move $a1, $zero +; LA64LARGE-NEXT: pcalau12i $a2, %pc_hi20(l) +; LA64LARGE-NEXT: addi.d $a3, $zero, %pc_lo12(l) +; LA64LARGE-NEXT: lu32i.d $a3, %pc64_lo20(l) +; LA64LARGE-NEXT: lu52i.d $a3, $a3, %pc64_hi12(l) ; LA64LARGE-NEXT: .p2align 4, , 16 ; LA64LARGE-NEXT: .LBB0_1: # %loop ; LA64LARGE-NEXT: # =>This Inner Loop Header: Depth=1 -; LA64LARGE-NEXT: ldx.w $zero, $a2, $a1 -; LA64LARGE-NEXT: addi.w $a3, $a3, 1 -; LA64LARGE-NEXT: blt $a3, $a0, .LBB0_1 +; LA64LARGE-NEXT: ldx.w $zero, $a3, $a2 +; LA64LARGE-NEXT: addi.w $a1, $a1, 1 +; LA64LARGE-NEXT: blt $a1, $a0, .LBB0_1 ; LA64LARGE-NEXT: # %bb.2: # %ret ; LA64LARGE-NEXT: ret entry: @@ -99,18 +99,18 @@ define void @test_la_got(i32 signext %n) { ; ; LA64LARGE-LABEL: test_la_got: ; LA64LARGE: # %bb.0: # %entry -; LA64LARGE-NEXT: pcalau12i $a1, %got_pc_hi20(g) -; LA64LARGE-NEXT: addi.d $a2, $zero, %got_pc_lo12(g) -; LA64LARGE-NEXT: lu32i.d $a2, %got64_pc_lo20(g) -; LA64LARGE-NEXT: lu52i.d $a2, $a2, %got64_pc_hi12(g) -; LA64LARGE-NEXT: ldx.d $a1, $a2, $a1 -; LA64LARGE-NEXT: move $a2, $zero +; LA64LARGE-NEXT: move $a1, $zero +; LA64LARGE-NEXT: pcalau12i $a2, %got_pc_hi20(g) +; LA64LARGE-NEXT: addi.d $a3, $zero, %got_pc_lo12(g) +; LA64LARGE-NEXT: lu32i.d $a3, %got64_pc_lo20(g) +; LA64LARGE-NEXT: lu52i.d $a3, $a3, %got64_pc_hi12(g) +; LA64LARGE-NEXT: ldx.d 
$a2, $a3, $a2 ; LA64LARGE-NEXT: .p2align 4, , 16 ; LA64LARGE-NEXT: .LBB1_1: # %loop ; LA64LARGE-NEXT: # =>This Inner Loop Header: Depth=1 -; LA64LARGE-NEXT: ld.w $zero, $a1, 0 -; LA64LARGE-NEXT: addi.w $a2, $a2, 1 -; LA64LARGE-NEXT: blt $a2, $a0, .LBB1_1 +; LA64LARGE-NEXT: ld.w $zero, $a2, 0 +; LA64LARGE-NEXT: addi.w $a1, $a1, 1 +; LA64LARGE-NEXT: blt $a1, $a0, .LBB1_1 ; LA64LARGE-NEXT: # %bb.2: # %ret ; LA64LARGE-NEXT: ret entry: @@ -161,18 +161,18 @@ define void @test_la_tls_ie(i32 signext %n) { ; ; LA64LARGE-LABEL: test_la_tls_ie: ; LA64LARGE: # %bb.0: # %entry -; LA64LARGE-NEXT: pcalau12i $a1, %ie_pc_hi20(ie) -; LA64LARGE-NEXT: addi.d $a2, $zero, %ie_pc_lo12(ie) -; LA64LARGE-NEXT: lu32i.d $a2, %ie64_pc_lo20(ie) -; LA64LARGE-NEXT: lu52i.d $a2, $a2, %ie64_pc_hi12(ie) -; LA64LARGE-NEXT: ldx.d $a1, $a2, $a1 -; LA64LARGE-NEXT: move $a2, $zero +; LA64LARGE-NEXT: move $a1, $zero +; LA64LARGE-NEXT: pcalau12i $a2, %ie_pc_hi20(ie) +; LA64LARGE-NEXT: addi.d $a3, $zero, %ie_pc_lo12(ie) +; LA64LARGE-NEXT: lu32i.d $a3, %ie64_pc_lo20(ie) +; LA64LARGE-NEXT: lu52i.d $a3, $a3, %ie64_pc_hi12(ie) +; LA64LARGE-NEXT: ldx.d $a2, $a3, $a2 ; LA64LARGE-NEXT: .p2align 4, , 16 ; LA64LARGE-NEXT: .LBB2_1: # %loop ; LA64LARGE-NEXT: # =>This Inner Loop Header: Depth=1 -; LA64LARGE-NEXT: ldx.w $zero, $a1, $tp -; LA64LARGE-NEXT: addi.w $a2, $a2, 1 -; LA64LARGE-NEXT: blt $a2, $a0, .LBB2_1 +; LA64LARGE-NEXT: ldx.w $zero, $a2, $tp +; LA64LARGE-NEXT: addi.w $a1, $a1, 1 +; LA64LARGE-NEXT: blt $a1, $a0, .LBB2_1 ; LA64LARGE-NEXT: # %bb.2: # %ret ; LA64LARGE-NEXT: ret entry: @@ -270,11 +270,11 @@ define void @test_la_tls_ld(i32 signext %n) { ; LA64LARGE-NEXT: .cfi_offset 23, -24 ; LA64LARGE-NEXT: .cfi_offset 24, -32 ; LA64LARGE-NEXT: move $fp, $a0 +; LA64LARGE-NEXT: move $s1, $zero ; LA64LARGE-NEXT: pcalau12i $a0, %ld_pc_hi20(ld) ; LA64LARGE-NEXT: addi.d $a1, $zero, %got_pc_lo12(ld) ; LA64LARGE-NEXT: lu32i.d $a1, %got64_pc_lo20(ld) ; LA64LARGE-NEXT: lu52i.d $a1, $a1, %got64_pc_hi12(ld) -; 
LA64LARGE-NEXT: move $s1, $zero ; LA64LARGE-NEXT: add.d $s0, $a1, $a0 ; LA64LARGE-NEXT: .p2align 4, , 16 ; LA64LARGE-NEXT: .LBB3_1: # %loop @@ -436,11 +436,11 @@ define void @test_la_tls_gd(i32 signext %n) nounwind { ; LA64LARGE-NEXT: st.d $s0, $sp, 8 # 8-byte Folded Spill ; LA64LARGE-NEXT: st.d $s1, $sp, 0 # 8-byte Folded Spill ; LA64LARGE-NEXT: move $fp, $a0 +; LA64LARGE-NEXT: move $s1, $zero ; LA64LARGE-NEXT: pcalau12i $a0, %gd_pc_hi20(gd) ; LA64LARGE-NEXT: addi.d $a1, $zero, %got_pc_lo12(gd) ; LA64LARGE-NEXT: lu32i.d $a1, %got64_pc_lo20(gd) ; LA64LARGE-NEXT: lu52i.d $a1, $a1, %got64_pc_hi12(gd) -; LA64LARGE-NEXT: move $s1, $zero ; LA64LARGE-NEXT: add.d $s0, $a1, $a0 ; LA64LARGE-NEXT: .p2align 4, , 16 ; LA64LARGE-NEXT: .LBB5_1: # %loop diff --git a/llvm/test/CodeGen/PowerPC/2013-07-01-PHIElimBug.mir b/llvm/test/CodeGen/PowerPC/2013-07-01-PHIElimBug.mir index 8bdf719f4bb5b..59f1477b5c37f 100644 --- a/llvm/test/CodeGen/PowerPC/2013-07-01-PHIElimBug.mir +++ b/llvm/test/CodeGen/PowerPC/2013-07-01-PHIElimBug.mir @@ -80,10 +80,9 @@ body: | # CHECK-NEXT: %15:g8rc = COPY killed %6 # CHECK: bb.3: # CHECK: %10:g8rc = COPY killed %15 -# CHECK-NEXT: %9:g8rc = COPY killed %14 +# CHECK-NEXT: %16:g8rc_and_g8rc_nox0 = COPY killed %14 # CHECK-NEXT: %14:g8rc = COPY killed %10 # CHECK-NEXT: %15:g8rc = IMPLICIT_DEF -# CHECK-NEXT: %16:g8rc_and_g8rc_nox0 = COPY killed %9 # CHECK-NEXT: BCC 68, %7, %bb.3 # CHECK-NEXT: B %bb.4 # CHECK: bb.4: diff --git a/llvm/test/CodeGen/PowerPC/check-zero-vector.ll b/llvm/test/CodeGen/PowerPC/check-zero-vector.ll index e6367e65d7200..4f8d5fca2b543 100644 --- a/llvm/test/CodeGen/PowerPC/check-zero-vector.ll +++ b/llvm/test/CodeGen/PowerPC/check-zero-vector.ll @@ -442,8 +442,8 @@ define i32 @test_Greater_than(ptr %colauths, i32 signext %ncols) { ; POWERPC_32-NEXT: cmplwi 4, 7 ; POWERPC_32-NEXT: bgt 0, L..BB0_4 ; POWERPC_32-NEXT: # %bb.2: -; POWERPC_32-NEXT: li 7, 0 ; POWERPC_32-NEXT: li 6, 0 +; POWERPC_32-NEXT: li 7, 0 ; POWERPC_32-NEXT: li 5, 0 ; 
POWERPC_32-NEXT: b L..BB0_13 ; POWERPC_32-NEXT: L..BB0_3: @@ -458,13 +458,13 @@ define i32 @test_Greater_than(ptr %colauths, i32 signext %ncols) { ; POWERPC_32-NEXT: ori 8, 6, 65472 ; POWERPC_32-NEXT: bge 0, L..BB0_6 ; POWERPC_32-NEXT: # %bb.5: -; POWERPC_32-NEXT: li 7, 0 ; POWERPC_32-NEXT: li 6, 0 +; POWERPC_32-NEXT: li 7, 0 ; POWERPC_32-NEXT: b L..BB0_10 ; POWERPC_32-NEXT: L..BB0_6: # %vector.ph -; POWERPC_32-NEXT: and 6, 4, 8 +; POWERPC_32-NEXT: and 7, 4, 8 ; POWERPC_32-NEXT: xxlxor 35, 35, 35 -; POWERPC_32-NEXT: li 7, 0 +; POWERPC_32-NEXT: li 6, 0 ; POWERPC_32-NEXT: li 9, 0 ; POWERPC_32-NEXT: mr 10, 3 ; POWERPC_32-NEXT: xxlxor 36, 36, 36 @@ -489,7 +489,7 @@ define i32 @test_Greater_than(ptr %colauths, i32 signext %ncols) { ; POWERPC_32-NEXT: lxv 50, 0(10) ; POWERPC_32-NEXT: addic 9, 9, 64 ; POWERPC_32-NEXT: addze 5, 5 -; POWERPC_32-NEXT: xor 11, 9, 6 +; POWERPC_32-NEXT: xor 11, 9, 7 ; POWERPC_32-NEXT: or. 11, 11, 5 ; POWERPC_32-NEXT: vcmpequh 18, 18, 3 ; POWERPC_32-NEXT: xxlnor 50, 50, 50 @@ -567,7 +567,7 @@ define i32 @test_Greater_than(ptr %colauths, i32 signext %ncols) { ; POWERPC_32-NEXT: # %bb.8: # %middle.block ; POWERPC_32-NEXT: vadduwm 3, 7, 4 ; POWERPC_32-NEXT: vadduwm 4, 8, 5 -; POWERPC_32-NEXT: xor. 9, 6, 4 +; POWERPC_32-NEXT: xor. 
9, 7, 4 ; POWERPC_32-NEXT: vadduwm 4, 13, 4 ; POWERPC_32-NEXT: vadduwm 3, 0, 3 ; POWERPC_32-NEXT: vadduwm 3, 9, 3 @@ -595,23 +595,22 @@ define i32 @test_Greater_than(ptr %colauths, i32 signext %ncols) { ; POWERPC_32-NEXT: stw 5, -32(1) ; POWERPC_32-NEXT: lwz 5, L..C0(2) # %const.0 ; POWERPC_32-NEXT: xxlxor 36, 36, 36 -; POWERPC_32-NEXT: mr 9, 6 +; POWERPC_32-NEXT: addi 8, 8, 56 ; POWERPC_32-NEXT: lxv 1, -32(1) -; POWERPC_32-NEXT: addi 6, 8, 56 -; POWERPC_32-NEXT: and 6, 4, 6 ; POWERPC_32-NEXT: xxlxor 35, 35, 35 ; POWERPC_32-NEXT: xxlxor 37, 37, 37 ; POWERPC_32-NEXT: lxv 0, 0(5) +; POWERPC_32-NEXT: and 8, 4, 8 ; POWERPC_32-NEXT: xxperm 36, 1, 0 ; POWERPC_32-NEXT: .align 4 ; POWERPC_32-NEXT: L..BB0_11: # %vec.epilog.vector.body ; POWERPC_32-NEXT: # -; POWERPC_32-NEXT: slwi 5, 9, 1 -; POWERPC_32-NEXT: addic 9, 9, 8 -; POWERPC_32-NEXT: addze 7, 7 +; POWERPC_32-NEXT: slwi 5, 7, 1 +; POWERPC_32-NEXT: addic 7, 7, 8 +; POWERPC_32-NEXT: addze 6, 6 ; POWERPC_32-NEXT: lxvx 32, 3, 5 -; POWERPC_32-NEXT: xor 5, 9, 6 -; POWERPC_32-NEXT: or. 5, 5, 7 +; POWERPC_32-NEXT: xor 5, 7, 8 +; POWERPC_32-NEXT: or. 5, 5, 6 ; POWERPC_32-NEXT: vcmpequh 0, 0, 3 ; POWERPC_32-NEXT: xxlnor 32, 32, 32 ; POWERPC_32-NEXT: vmrghh 1, 0, 0 @@ -623,9 +622,10 @@ define i32 @test_Greater_than(ptr %colauths, i32 signext %ncols) { ; POWERPC_32-NEXT: bne 0, L..BB0_11 ; POWERPC_32-NEXT: # %bb.12: # %vec.epilog.middle.block ; POWERPC_32-NEXT: vadduwm 2, 4, 5 -; POWERPC_32-NEXT: xor. 7, 6, 4 -; POWERPC_32-NEXT: li 7, 0 +; POWERPC_32-NEXT: xor. 
6, 8, 4 +; POWERPC_32-NEXT: li 6, 0 ; POWERPC_32-NEXT: xxswapd 35, 34 +; POWERPC_32-NEXT: mr 7, 8 ; POWERPC_32-NEXT: vadduwm 2, 2, 3 ; POWERPC_32-NEXT: xxspltw 35, 34, 1 ; POWERPC_32-NEXT: vadduwm 2, 2, 3 @@ -633,21 +633,21 @@ define i32 @test_Greater_than(ptr %colauths, i32 signext %ncols) { ; POWERPC_32-NEXT: lwz 5, -48(1) ; POWERPC_32-NEXT: beq 0, L..BB0_15 ; POWERPC_32-NEXT: L..BB0_13: # %for.body.preheader -; POWERPC_32-NEXT: slwi 8, 6, 1 +; POWERPC_32-NEXT: slwi 8, 7, 1 ; POWERPC_32-NEXT: add 3, 8, 3 ; POWERPC_32-NEXT: addi 3, 3, -2 ; POWERPC_32-NEXT: .align 4 ; POWERPC_32-NEXT: L..BB0_14: # %for.body ; POWERPC_32-NEXT: # ; POWERPC_32-NEXT: lhzu 8, 2(3) -; POWERPC_32-NEXT: addic 6, 6, 1 -; POWERPC_32-NEXT: addze 7, 7 +; POWERPC_32-NEXT: addic 7, 7, 1 +; POWERPC_32-NEXT: addze 6, 6 ; POWERPC_32-NEXT: cntlzw 8, 8 ; POWERPC_32-NEXT: not 8, 8 ; POWERPC_32-NEXT: rlwinm 8, 8, 27, 31, 31 ; POWERPC_32-NEXT: add 5, 5, 8 -; POWERPC_32-NEXT: xor 8, 6, 4 -; POWERPC_32-NEXT: or. 8, 8, 7 +; POWERPC_32-NEXT: xor 8, 7, 4 +; POWERPC_32-NEXT: or. 
8, 8, 6 ; POWERPC_32-NEXT: bne 0, L..BB0_14 ; POWERPC_32-NEXT: L..BB0_15: # %for.cond.cleanup ; POWERPC_32-NEXT: mr 3, 5 diff --git a/llvm/test/CodeGen/PowerPC/disable-ctr-ppcf128.ll b/llvm/test/CodeGen/PowerPC/disable-ctr-ppcf128.ll index cd5ea16d4600b..cd2fbdfe71263 100644 --- a/llvm/test/CodeGen/PowerPC/disable-ctr-ppcf128.ll +++ b/llvm/test/CodeGen/PowerPC/disable-ctr-ppcf128.ll @@ -51,9 +51,9 @@ define ppc_fp128 @test_ctr0() { ; P9BE-NEXT: .cfi_offset r30, -16 ; P9BE-NEXT: li r3, 1 ; P9BE-NEXT: std r30, 112(r1) # 8-byte Folded Spill -; P9BE-NEXT: xxlxor f1, f1, f1 -; P9BE-NEXT: rldic r30, r3, 62, 1 ; P9BE-NEXT: xxlxor f2, f2, f2 +; P9BE-NEXT: rldic r30, r3, 62, 1 +; P9BE-NEXT: xxlxor f1, f1, f1 ; P9BE-NEXT: .p2align 5 ; P9BE-NEXT: .LBB0_1: # %bb6 ; P9BE-NEXT: # @@ -111,8 +111,8 @@ define ppc_fp128 @test_ctr0() { ; P8BE-NEXT: .cfi_offset r30, -16 ; P8BE-NEXT: li r3, 1 ; P8BE-NEXT: std r30, 112(r1) # 8-byte Folded Spill -; P8BE-NEXT: xxlxor f1, f1, f1 ; P8BE-NEXT: xxlxor f2, f2, f2 +; P8BE-NEXT: xxlxor f1, f1, f1 ; P8BE-NEXT: rldic r30, r3, 62, 1 ; P8BE-NEXT: .p2align 5 ; P8BE-NEXT: .LBB0_1: # %bb6 diff --git a/llvm/test/CodeGen/PowerPC/phi-eliminate.mir b/llvm/test/CodeGen/PowerPC/phi-eliminate.mir index 72f778286abe4..a4b18e648b7a2 100644 --- a/llvm/test/CodeGen/PowerPC/phi-eliminate.mir +++ b/llvm/test/CodeGen/PowerPC/phi-eliminate.mir @@ -195,12 +195,9 @@ body: | ; CHECK: bb.4: ; CHECK: successors: %bb.5(0x80000000) ; CHECK: %44:g8rc_and_g8rc_nox0 = COPY killed %59 - ; CHECK: %43:gprc = COPY killed %57 - ; CHECK: %41:gprc = COPY killed %60 - ; CHECK: %39:g8rc = COPY killed %44 - ; CHECK: %61:gprc = COPY killed %41 - ; CHECK: %62:g8rc_and_g8rc_nox0 = COPY killed %39 - ; CHECK: %63:gprc = COPY killed %43 + ; CHECK: %63:gprc = COPY killed %57 + ; CHECK: %61:gprc = COPY killed %60 + ; CHECK: %62:g8rc_and_g8rc_nox0 = COPY killed %44 ; CHECK: bb.5: ; CHECK: successors: %bb.6(0x80000000) diff --git a/llvm/test/CodeGen/PowerPC/ppcf128-freeze.mir 
b/llvm/test/CodeGen/PowerPC/ppcf128-freeze.mir index 474c288bba88b..4cad98eeade77 100644 --- a/llvm/test/CodeGen/PowerPC/ppcf128-freeze.mir +++ b/llvm/test/CodeGen/PowerPC/ppcf128-freeze.mir @@ -1,21 +1,10 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 # RUN: llc -mtriple powerpc64le-unknown-linux-gnu -start-after=codegenprepare \ # RUN: -o - %s -verify-machineinstrs | FileCheck %s --- | define ppc_fp128 @freeze_select(ppc_fp128 %a, ppc_fp128 %b) { - %sel.frozen = freeze ppc_fp128 %a - %cmp = fcmp one ppc_fp128 %sel.frozen, 0xM00000000000000000000000000000000 - br i1 %cmp, label %select.end, label %select.false - - select.false: ; preds = %0 - br label %select.end - - select.end: ; preds = %0, %select.false - %sel = phi ppc_fp128 [ %a, %0 ], [ %b, %select.false ] - ret ppc_fp128 %sel - } - - ; CHECK-LABEL: freeze_select + ; CHECK-LABEL: freeze_select: ; CHECK: # %bb.0: ; CHECK-NEXT: xxlxor 0, 0, 0 ; CHECK-NEXT: fcmpu 1, 2, 2 @@ -28,8 +17,19 @@ ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: crnor 20, 7, 2 ; CHECK-NEXT: bclr 12, 20, 0 - ; CHECK-NEXT: # %bb.2: # %select.false - ; CHECK-NEXT: fmr 1, 3 + ; CHECK-NEXT: # %bb.2: # %select.false ; CHECK-NEXT: fmr 2, 4 + ; CHECK-NEXT: fmr 1, 3 ; CHECK-NEXT: blr + %sel.frozen = freeze ppc_fp128 %a + %cmp = fcmp one ppc_fp128 %sel.frozen, 0xM00000000000000000000000000000000 + br i1 %cmp, label %select.end, label %select.false + + select.false: ; preds = %0 + br label %select.end + + select.end: ; preds = %0, %select.false + %sel = phi ppc_fp128 [ %a, %0 ], [ %b, %select.false ] + ret ppc_fp128 %sel + } ... 
diff --git a/llvm/test/CodeGen/PowerPC/pr116071.ll b/llvm/test/CodeGen/PowerPC/pr116071.ll index 29f11fc1d3a63..5db84436c22f6 100644 --- a/llvm/test/CodeGen/PowerPC/pr116071.ll +++ b/llvm/test/CodeGen/PowerPC/pr116071.ll @@ -1,9 +1,26 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -disable-ppc-vsx-fma-mutation=false -mcpu=pwr10 -verify-machineinstrs \ -; RUN: -ppc-asm-full-reg-names -mtriple powerpc64-ibm-aix7.2.0.0 < %s | FileCheck %s +; RUN: -ppc-asm-full-reg-names -mtriple powerpc64-ibm-aix7.2.0.0 < %s | FileCheck %s target datalayout = "E-m:a-Fi64-i64:64-n32:64-S128-v256:256:256-v512:512:512" define void @initial(<2 x double> %0){ +; CHECK-LABEL: initial: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxlxor vs0, vs0, vs0 +; CHECK-NEXT: xxlxor f2, f2, f2 +; CHECK-NEXT: xxlxor f4, f4, f4 +; CHECK-NEXT: xxlxor f3, f3, f3 +; CHECK-NEXT: xvmuldp vs1, vs34, vs0 +; CHECK-NEXT: .align 5 +; CHECK-NEXT: L..BB0_1: # %for.cond251.preheader.lr.ph +; CHECK-NEXT: # +; CHECK-NEXT: fmr f5, f3 +; CHECK-NEXT: xsadddp f3, f3, f4 +; CHECK-NEXT: fmr f4, f5 +; CHECK-NEXT: xxmrghd vs3, vs3, vs2 +; CHECK-NEXT: xvmaddmdp vs3, vs0, vs1 +; CHECK-NEXT: b L..BB0_1 entry: %1 = fmul <2 x double> %0, zeroinitializer br label %for.cond251.preheader.lr.ph @@ -18,9 +35,3 @@ for.cond251.preheader.lr.ph: ; preds = %for.cond251.prehead %7 = extractelement <2 x double> %6, i64 0 br label %for.cond251.preheader.lr.ph } - -; CHECK: xsadddp f4, f3, f4 -; CHECK-NEXT: xxmrghd vs5, vs4, vs2 -; CHECK-NEXT: fmr f4, f3 -; CHECK-NEXT: xvmaddmdp vs5, vs0, vs1 -; CHECK-NEXT: fmr f3, f5 diff --git a/llvm/test/CodeGen/PowerPC/sms-phi-2.ll b/llvm/test/CodeGen/PowerPC/sms-phi-2.ll index 4904d11fc8104..0077673292ab3 100644 --- a/llvm/test/CodeGen/PowerPC/sms-phi-2.ll +++ b/llvm/test/CodeGen/PowerPC/sms-phi-2.ll @@ -9,7 +9,7 @@ define void @phi2(i32, i32, ptr) local_unnamed_addr { ; CHECK-NEXT: li 5, 55 ; CHECK-NEXT: li 6, 48 ; CHECK-NEXT: mtctr 3 -; 
CHECK-NEXT: bdz .LBB0_4 +; CHECK-NEXT: bdz .LBB0_3 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: divw 9, 8, 4 ; CHECK-NEXT: mullw 7, 8, 4 @@ -19,7 +19,7 @@ define void @phi2(i32, i32, ptr) local_unnamed_addr { ; CHECK-NEXT: add 3, 7, 3 ; CHECK-NEXT: stbu 3, -1(7) ; CHECK-NEXT: mr 3, 8 -; CHECK-NEXT: bdz .LBB0_3 +; CHECK-NEXT: bdz .LBB0_4 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: ; CHECK-NEXT: mr 3, 9 @@ -33,13 +33,12 @@ define void @phi2(i32, i32, ptr) local_unnamed_addr { ; CHECK-NEXT: stbu 8, -1(7) ; CHECK-NEXT: mr 8, 3 ; CHECK-NEXT: bdnz .LBB0_2 +; CHECK-NEXT: b .LBB0_4 ; CHECK-NEXT: .LBB0_3: -; CHECK-NEXT: mr 8, 9 -; CHECK-NEXT: b .LBB0_5 -; CHECK-NEXT: .LBB0_4: ; CHECK-NEXT: # implicit-def: $x7 -; CHECK-NEXT: .LBB0_5: -; CHECK-NEXT: mullw 4, 8, 4 +; CHECK-NEXT: mr 9, 8 +; CHECK-NEXT: .LBB0_4: +; CHECK-NEXT: mullw 4, 9, 4 ; CHECK-NEXT: sub 3, 3, 4 ; CHECK-NEXT: cmplwi 3, 10 ; CHECK-NEXT: isellt 4, 6, 5 diff --git a/llvm/test/CodeGen/PowerPC/sms-phi-3.ll b/llvm/test/CodeGen/PowerPC/sms-phi-3.ll index 628822edabf39..2dd8b36389c62 100644 --- a/llvm/test/CodeGen/PowerPC/sms-phi-3.ll +++ b/llvm/test/CodeGen/PowerPC/sms-phi-3.ll @@ -19,34 +19,34 @@ define void @phi3(ptr) nounwind { ; CHECK-NEXT: mr 29, 3 ; CHECK-NEXT: bl malloc ; CHECK-NEXT: nop -; CHECK-NEXT: addi 7, 30, -4 +; CHECK-NEXT: addi 6, 30, -4 ; CHECK-NEXT: mtctr 3 ; CHECK-NEXT: addi 4, 29, -8 ; CHECK-NEXT: li 5, 0 -; CHECK-NEXT: lwzu 8, 4(7) +; CHECK-NEXT: lwzu 8, 4(6) ; CHECK-NEXT: bdz .LBB0_5 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: extswsli 6, 5, 5 +; CHECK-NEXT: extswsli 7, 5, 5 ; CHECK-NEXT: add 5, 8, 5 -; CHECK-NEXT: lwzu 8, 4(7) +; CHECK-NEXT: lwzu 8, 4(6) ; CHECK-NEXT: bdz .LBB0_4 ; CHECK-NEXT: # %bb.2: -; CHECK-NEXT: add 6, 3, 6 -; CHECK-NEXT: stdu 6, 8(4) -; CHECK-NEXT: extswsli 6, 5, 5 +; CHECK-NEXT: add 7, 3, 7 +; CHECK-NEXT: stdu 7, 8(4) +; CHECK-NEXT: extswsli 7, 5, 5 ; CHECK-NEXT: add 5, 8, 5 -; CHECK-NEXT: lwzu 8, 4(7) +; CHECK-NEXT: lwzu 8, 4(6) ; CHECK-NEXT: bdz .LBB0_4 ; CHECK-NEXT: .p2align 
5 ; CHECK-NEXT: .LBB0_3: -; CHECK-NEXT: add 9, 3, 6 -; CHECK-NEXT: extswsli 6, 5, 5 +; CHECK-NEXT: add 9, 3, 7 +; CHECK-NEXT: extswsli 7, 5, 5 ; CHECK-NEXT: add 5, 8, 5 -; CHECK-NEXT: lwzu 8, 4(7) +; CHECK-NEXT: lwzu 8, 4(6) ; CHECK-NEXT: stdu 9, 8(4) ; CHECK-NEXT: bdnz .LBB0_3 ; CHECK-NEXT: .LBB0_4: -; CHECK-NEXT: add 6, 3, 6 +; CHECK-NEXT: add 6, 3, 7 ; CHECK-NEXT: stdu 6, 8(4) ; CHECK-NEXT: .LBB0_5: ; CHECK-NEXT: extswsli 5, 5, 5 diff --git a/llvm/test/CodeGen/PowerPC/stack-restore-with-setjmp.ll b/llvm/test/CodeGen/PowerPC/stack-restore-with-setjmp.ll index e225e63980c7f..eec5b4588f7c3 100644 --- a/llvm/test/CodeGen/PowerPC/stack-restore-with-setjmp.ll +++ b/llvm/test/CodeGen/PowerPC/stack-restore-with-setjmp.ll @@ -16,13 +16,12 @@ define dso_local signext i32 @main(i32 signext %argc, ptr nocapture readnone %ar ; CHECK-NEXT: stw 12, 8(1) ; CHECK-NEXT: mflr 0 ; CHECK-NEXT: stdu 1, -784(1) -; CHECK-NEXT: # kill: def $r3 killed $r3 killed $x3 -; CHECK-NEXT: cmpwi 2, 3, 2 -; CHECK-NEXT: li 4, 0 -; CHECK-NEXT: # kill: def $r4 killed $r4 killed $x4 -; CHECK-NEXT: mr 3, 4 +; CHECK-NEXT: mr 4, 3 ; CHECK-NEXT: std 0, 800(1) ; CHECK-NEXT: mr 31, 1 +; CHECK-NEXT: li 3, 0 +; CHECK-NEXT: # kill: def $r3 killed $r3 killed $x3 +; CHECK-NEXT: cmpwi 2, 4, 2 ; CHECK-NEXT: blt 2, .LBB0_3 ; CHECK-NEXT: # %bb.1: # %if.end ; CHECK-NEXT: addi 3, 31, 112 @@ -66,7 +65,6 @@ define dso_local signext i32 @main(i32 signext %argc, ptr nocapture readnone %ar ; BE-NEXT: stdu 1, -800(1) ; BE-NEXT: li 4, 0 ; BE-NEXT: # kill: def $r3 killed $r3 killed $x3 -; BE-NEXT: # kill: def $r4 killed $r4 killed $x4 ; BE-NEXT: cmpwi 2, 3, 2 ; BE-NEXT: mr 3, 4 ; BE-NEXT: std 0, 816(1) diff --git a/llvm/test/CodeGen/PowerPC/subreg-postra-2.ll b/llvm/test/CodeGen/PowerPC/subreg-postra-2.ll index f696745c9d414..10fa8221778f5 100644 --- a/llvm/test/CodeGen/PowerPC/subreg-postra-2.ll +++ b/llvm/test/CodeGen/PowerPC/subreg-postra-2.ll @@ -60,27 +60,27 @@ define void @jbd2_journal_commit_transaction(i32 %input1, 
ptr %input2, ptr %inpu ; CHECK-NO-ISEL-NEXT: bne- 0, .Ltmp0 ; CHECK-NO-ISEL-EMPTY: ; CHECK-NO-ISEL-NEXT: #NO_APP -; CHECK-NO-ISEL-NEXT: std 5, 0(6) +; CHECK-NO-ISEL-NEXT: std 4, 0(6) ; CHECK-NO-ISEL-NEXT: beq- 5, .LBB0_6 ; CHECK-NO-ISEL-NEXT: .LBB0_2: # %while.body392 ; CHECK-NO-ISEL-NEXT: # ; CHECK-NO-ISEL-NEXT: bne- 1, .LBB0_5 ; CHECK-NO-ISEL-NEXT: # %bb.3: # %wait_on_buffer.exit1319 ; CHECK-NO-ISEL-NEXT: # -; CHECK-NO-ISEL-NEXT: ld 5, 0(6) -; CHECK-NO-ISEL-NEXT: mr 9, 5 -; CHECK-NO-ISEL-NEXT: ldu 4, -72(9) -; CHECK-NO-ISEL-NEXT: andi. 4, 4, 1 -; CHECK-NO-ISEL-NEXT: mr 4, 3 +; CHECK-NO-ISEL-NEXT: ld 4, 0(6) +; CHECK-NO-ISEL-NEXT: mr 9, 4 +; CHECK-NO-ISEL-NEXT: ldu 5, -72(9) +; CHECK-NO-ISEL-NEXT: andi. 5, 5, 1 +; CHECK-NO-ISEL-NEXT: mr 5, 3 ; CHECK-NO-ISEL-NEXT: bc 12, 1, .LBB0_1 ; CHECK-NO-ISEL-NEXT: # %bb.4: # %wait_on_buffer.exit1319 ; CHECK-NO-ISEL-NEXT: # -; CHECK-NO-ISEL-NEXT: li 4, -5 +; CHECK-NO-ISEL-NEXT: li 5, -5 ; CHECK-NO-ISEL-NEXT: b .LBB0_1 ; CHECK-NO-ISEL-NEXT: .LBB0_5: -; CHECK-NO-ISEL-NEXT: mr 4, 7 +; CHECK-NO-ISEL-NEXT: mr 5, 7 ; CHECK-NO-ISEL-NEXT: .LBB0_6: # %while.end418 -; CHECK-NO-ISEL-NEXT: cmplwi 4, 0 +; CHECK-NO-ISEL-NEXT: cmplwi 5, 0 ; CHECK-NO-ISEL-NEXT: beq 0, .LBB0_8 ; CHECK-NO-ISEL-NEXT: # %bb.7: # %if.then420 ; CHECK-NO-ISEL-NEXT: .LBB0_8: # %if.end421 diff --git a/llvm/test/CodeGen/PowerPC/vsx.ll b/llvm/test/CodeGen/PowerPC/vsx.ll index 14b3d69f8c273..33d7395693b49 100644 --- a/llvm/test/CodeGen/PowerPC/vsx.ll +++ b/llvm/test/CodeGen/PowerPC/vsx.ll @@ -2487,9 +2487,8 @@ define double @test82(double %a, double %b, double %c, double %d) { ; CHECK-FISL-LABEL: test82: ; CHECK-FISL: # %bb.0: # %entry ; CHECK-FISL-NEXT: stfd f2, -16(r1) # 8-byte Folded Spill -; CHECK-FISL-NEXT: fmr f2, f1 +; CHECK-FISL-NEXT: stfd f1, -8(r1) # 8-byte Folded Spill ; CHECK-FISL-NEXT: xscmpudp cr0, f3, f4 -; CHECK-FISL-NEXT: stfd f2, -8(r1) # 8-byte Folded Spill ; CHECK-FISL-NEXT: beq cr0, .LBB67_2 ; CHECK-FISL-NEXT: # %bb.1: # %entry ; CHECK-FISL-NEXT: lfd 
f0, -16(r1) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/abds.ll b/llvm/test/CodeGen/RISCV/abds.ll index 28a95ef4f8de9..59862996b8e41 100644 --- a/llvm/test/CodeGen/RISCV/abds.ll +++ b/llvm/test/CodeGen/RISCV/abds.ll @@ -543,17 +543,17 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: lw t1, 12(a2) ; RV32I-NEXT: lw a2, 4(a2) ; RV32I-NEXT: sltu t0, a6, a5 -; RV32I-NEXT: mv t4, t0 +; RV32I-NEXT: mv t3, t0 ; RV32I-NEXT: beq a7, t1, .LBB11_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: slt t4, t1, a7 +; RV32I-NEXT: slt t3, t1, a7 ; RV32I-NEXT: .LBB11_2: ; RV32I-NEXT: sltu t2, a1, a3 ; RV32I-NEXT: sltu t5, a2, a4 -; RV32I-NEXT: mv t3, t2 +; RV32I-NEXT: mv t4, t2 ; RV32I-NEXT: beq a4, a2, .LBB11_4 ; RV32I-NEXT: # %bb.3: -; RV32I-NEXT: mv t3, t5 +; RV32I-NEXT: mv t4, t5 ; RV32I-NEXT: .LBB11_4: ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: sw s0, 12(sp) # 4-byte Folded Spill @@ -562,12 +562,12 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: or t6, s0, t6 ; RV32I-NEXT: beqz t6, .LBB11_6 ; RV32I-NEXT: # %bb.5: -; RV32I-NEXT: mv t3, t4 +; RV32I-NEXT: mv t4, t3 ; RV32I-NEXT: .LBB11_6: -; RV32I-NEXT: mv t4, t2 +; RV32I-NEXT: mv t3, t2 ; RV32I-NEXT: beq a2, a4, .LBB11_8 ; RV32I-NEXT: # %bb.7: -; RV32I-NEXT: mv t4, t5 +; RV32I-NEXT: mv t3, t5 ; RV32I-NEXT: .LBB11_8: ; RV32I-NEXT: sltu t5, a3, a1 ; RV32I-NEXT: mv t6, t5 @@ -575,17 +575,17 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: # %bb.9: ; RV32I-NEXT: sltu t6, a4, a2 ; RV32I-NEXT: .LBB11_10: -; RV32I-NEXT: bnez t3, .LBB11_12 +; RV32I-NEXT: bnez t4, .LBB11_12 ; RV32I-NEXT: # %bb.11: ; RV32I-NEXT: sub a7, t1, a7 ; RV32I-NEXT: sub a5, a6, a5 ; RV32I-NEXT: sub a1, a1, a3 ; RV32I-NEXT: sub a2, a2, a4 ; RV32I-NEXT: sub a4, a7, t0 -; RV32I-NEXT: sltu a6, a5, t4 +; RV32I-NEXT: sltu a6, a5, t3 ; RV32I-NEXT: sub a3, a2, t2 ; RV32I-NEXT: sub a2, a4, a6 -; RV32I-NEXT: sub a4, a5, t4 +; RV32I-NEXT: sub a4, a5, t3 ; RV32I-NEXT: j .LBB11_13 ; RV32I-NEXT: 
.LBB11_12: ; RV32I-NEXT: sltu t0, a5, a6 @@ -639,17 +639,17 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: lw t1, 12(a2) ; RV32ZBB-NEXT: lw a2, 4(a2) ; RV32ZBB-NEXT: sltu t0, a6, a5 -; RV32ZBB-NEXT: mv t4, t0 +; RV32ZBB-NEXT: mv t3, t0 ; RV32ZBB-NEXT: beq a7, t1, .LBB11_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: slt t4, t1, a7 +; RV32ZBB-NEXT: slt t3, t1, a7 ; RV32ZBB-NEXT: .LBB11_2: ; RV32ZBB-NEXT: sltu t2, a1, a3 ; RV32ZBB-NEXT: sltu t5, a2, a4 -; RV32ZBB-NEXT: mv t3, t2 +; RV32ZBB-NEXT: mv t4, t2 ; RV32ZBB-NEXT: beq a4, a2, .LBB11_4 ; RV32ZBB-NEXT: # %bb.3: -; RV32ZBB-NEXT: mv t3, t5 +; RV32ZBB-NEXT: mv t4, t5 ; RV32ZBB-NEXT: .LBB11_4: ; RV32ZBB-NEXT: addi sp, sp, -16 ; RV32ZBB-NEXT: sw s0, 12(sp) # 4-byte Folded Spill @@ -658,12 +658,12 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: or t6, s0, t6 ; RV32ZBB-NEXT: beqz t6, .LBB11_6 ; RV32ZBB-NEXT: # %bb.5: -; RV32ZBB-NEXT: mv t3, t4 +; RV32ZBB-NEXT: mv t4, t3 ; RV32ZBB-NEXT: .LBB11_6: -; RV32ZBB-NEXT: mv t4, t2 +; RV32ZBB-NEXT: mv t3, t2 ; RV32ZBB-NEXT: beq a2, a4, .LBB11_8 ; RV32ZBB-NEXT: # %bb.7: -; RV32ZBB-NEXT: mv t4, t5 +; RV32ZBB-NEXT: mv t3, t5 ; RV32ZBB-NEXT: .LBB11_8: ; RV32ZBB-NEXT: sltu t5, a3, a1 ; RV32ZBB-NEXT: mv t6, t5 @@ -671,17 +671,17 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: # %bb.9: ; RV32ZBB-NEXT: sltu t6, a4, a2 ; RV32ZBB-NEXT: .LBB11_10: -; RV32ZBB-NEXT: bnez t3, .LBB11_12 +; RV32ZBB-NEXT: bnez t4, .LBB11_12 ; RV32ZBB-NEXT: # %bb.11: ; RV32ZBB-NEXT: sub a7, t1, a7 ; RV32ZBB-NEXT: sub a5, a6, a5 ; RV32ZBB-NEXT: sub a1, a1, a3 ; RV32ZBB-NEXT: sub a2, a2, a4 ; RV32ZBB-NEXT: sub a4, a7, t0 -; RV32ZBB-NEXT: sltu a6, a5, t4 +; RV32ZBB-NEXT: sltu a6, a5, t3 ; RV32ZBB-NEXT: sub a3, a2, t2 ; RV32ZBB-NEXT: sub a2, a4, a6 -; RV32ZBB-NEXT: sub a4, a5, t4 +; RV32ZBB-NEXT: sub a4, a5, t3 ; RV32ZBB-NEXT: j .LBB11_13 ; RV32ZBB-NEXT: .LBB11_12: ; RV32ZBB-NEXT: sltu t0, a5, a6 @@ -743,17 +743,17 @@ define i128 
@abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: lw t1, 12(a2) ; RV32I-NEXT: lw a2, 4(a2) ; RV32I-NEXT: sltu t0, a6, a5 -; RV32I-NEXT: mv t4, t0 +; RV32I-NEXT: mv t3, t0 ; RV32I-NEXT: beq a7, t1, .LBB12_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: slt t4, t1, a7 +; RV32I-NEXT: slt t3, t1, a7 ; RV32I-NEXT: .LBB12_2: ; RV32I-NEXT: sltu t2, a1, a3 ; RV32I-NEXT: sltu t5, a2, a4 -; RV32I-NEXT: mv t3, t2 +; RV32I-NEXT: mv t4, t2 ; RV32I-NEXT: beq a4, a2, .LBB12_4 ; RV32I-NEXT: # %bb.3: -; RV32I-NEXT: mv t3, t5 +; RV32I-NEXT: mv t4, t5 ; RV32I-NEXT: .LBB12_4: ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: sw s0, 12(sp) # 4-byte Folded Spill @@ -762,12 +762,12 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: or t6, s0, t6 ; RV32I-NEXT: beqz t6, .LBB12_6 ; RV32I-NEXT: # %bb.5: -; RV32I-NEXT: mv t3, t4 +; RV32I-NEXT: mv t4, t3 ; RV32I-NEXT: .LBB12_6: -; RV32I-NEXT: mv t4, t2 +; RV32I-NEXT: mv t3, t2 ; RV32I-NEXT: beq a2, a4, .LBB12_8 ; RV32I-NEXT: # %bb.7: -; RV32I-NEXT: mv t4, t5 +; RV32I-NEXT: mv t3, t5 ; RV32I-NEXT: .LBB12_8: ; RV32I-NEXT: sltu t5, a3, a1 ; RV32I-NEXT: mv t6, t5 @@ -775,17 +775,17 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: # %bb.9: ; RV32I-NEXT: sltu t6, a4, a2 ; RV32I-NEXT: .LBB12_10: -; RV32I-NEXT: bnez t3, .LBB12_12 +; RV32I-NEXT: bnez t4, .LBB12_12 ; RV32I-NEXT: # %bb.11: ; RV32I-NEXT: sub a7, t1, a7 ; RV32I-NEXT: sub a5, a6, a5 ; RV32I-NEXT: sub a1, a1, a3 ; RV32I-NEXT: sub a2, a2, a4 ; RV32I-NEXT: sub a4, a7, t0 -; RV32I-NEXT: sltu a6, a5, t4 +; RV32I-NEXT: sltu a6, a5, t3 ; RV32I-NEXT: sub a3, a2, t2 ; RV32I-NEXT: sub a2, a4, a6 -; RV32I-NEXT: sub a4, a5, t4 +; RV32I-NEXT: sub a4, a5, t3 ; RV32I-NEXT: j .LBB12_13 ; RV32I-NEXT: .LBB12_12: ; RV32I-NEXT: sltu t0, a5, a6 @@ -839,17 +839,17 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: lw t1, 12(a2) ; RV32ZBB-NEXT: lw a2, 4(a2) ; RV32ZBB-NEXT: sltu t0, a6, a5 -; RV32ZBB-NEXT: mv t4, t0 +; 
RV32ZBB-NEXT: mv t3, t0 ; RV32ZBB-NEXT: beq a7, t1, .LBB12_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: slt t4, t1, a7 +; RV32ZBB-NEXT: slt t3, t1, a7 ; RV32ZBB-NEXT: .LBB12_2: ; RV32ZBB-NEXT: sltu t2, a1, a3 ; RV32ZBB-NEXT: sltu t5, a2, a4 -; RV32ZBB-NEXT: mv t3, t2 +; RV32ZBB-NEXT: mv t4, t2 ; RV32ZBB-NEXT: beq a4, a2, .LBB12_4 ; RV32ZBB-NEXT: # %bb.3: -; RV32ZBB-NEXT: mv t3, t5 +; RV32ZBB-NEXT: mv t4, t5 ; RV32ZBB-NEXT: .LBB12_4: ; RV32ZBB-NEXT: addi sp, sp, -16 ; RV32ZBB-NEXT: sw s0, 12(sp) # 4-byte Folded Spill @@ -858,12 +858,12 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: or t6, s0, t6 ; RV32ZBB-NEXT: beqz t6, .LBB12_6 ; RV32ZBB-NEXT: # %bb.5: -; RV32ZBB-NEXT: mv t3, t4 +; RV32ZBB-NEXT: mv t4, t3 ; RV32ZBB-NEXT: .LBB12_6: -; RV32ZBB-NEXT: mv t4, t2 +; RV32ZBB-NEXT: mv t3, t2 ; RV32ZBB-NEXT: beq a2, a4, .LBB12_8 ; RV32ZBB-NEXT: # %bb.7: -; RV32ZBB-NEXT: mv t4, t5 +; RV32ZBB-NEXT: mv t3, t5 ; RV32ZBB-NEXT: .LBB12_8: ; RV32ZBB-NEXT: sltu t5, a3, a1 ; RV32ZBB-NEXT: mv t6, t5 @@ -871,17 +871,17 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: # %bb.9: ; RV32ZBB-NEXT: sltu t6, a4, a2 ; RV32ZBB-NEXT: .LBB12_10: -; RV32ZBB-NEXT: bnez t3, .LBB12_12 +; RV32ZBB-NEXT: bnez t4, .LBB12_12 ; RV32ZBB-NEXT: # %bb.11: ; RV32ZBB-NEXT: sub a7, t1, a7 ; RV32ZBB-NEXT: sub a5, a6, a5 ; RV32ZBB-NEXT: sub a1, a1, a3 ; RV32ZBB-NEXT: sub a2, a2, a4 ; RV32ZBB-NEXT: sub a4, a7, t0 -; RV32ZBB-NEXT: sltu a6, a5, t4 +; RV32ZBB-NEXT: sltu a6, a5, t3 ; RV32ZBB-NEXT: sub a3, a2, t2 ; RV32ZBB-NEXT: sub a2, a4, a6 -; RV32ZBB-NEXT: sub a4, a5, t4 +; RV32ZBB-NEXT: sub a4, a5, t3 ; RV32ZBB-NEXT: j .LBB12_13 ; RV32ZBB-NEXT: .LBB12_12: ; RV32ZBB-NEXT: sltu t0, a5, a6 @@ -1132,17 +1132,17 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: lw t1, 12(a2) ; RV32I-NEXT: lw a2, 4(a2) ; RV32I-NEXT: sltu t0, a6, a5 -; RV32I-NEXT: mv t4, t0 +; RV32I-NEXT: mv t3, t0 ; RV32I-NEXT: beq a7, t1, .LBB17_2 ; RV32I-NEXT: # 
%bb.1: -; RV32I-NEXT: slt t4, t1, a7 +; RV32I-NEXT: slt t3, t1, a7 ; RV32I-NEXT: .LBB17_2: ; RV32I-NEXT: sltu t2, a1, a3 ; RV32I-NEXT: sltu t5, a2, a4 -; RV32I-NEXT: mv t3, t2 +; RV32I-NEXT: mv t4, t2 ; RV32I-NEXT: beq a4, a2, .LBB17_4 ; RV32I-NEXT: # %bb.3: -; RV32I-NEXT: mv t3, t5 +; RV32I-NEXT: mv t4, t5 ; RV32I-NEXT: .LBB17_4: ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: sw s0, 12(sp) # 4-byte Folded Spill @@ -1151,12 +1151,12 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: or t6, s0, t6 ; RV32I-NEXT: beqz t6, .LBB17_6 ; RV32I-NEXT: # %bb.5: -; RV32I-NEXT: mv t3, t4 +; RV32I-NEXT: mv t4, t3 ; RV32I-NEXT: .LBB17_6: -; RV32I-NEXT: mv t4, t2 +; RV32I-NEXT: mv t3, t2 ; RV32I-NEXT: beq a2, a4, .LBB17_8 ; RV32I-NEXT: # %bb.7: -; RV32I-NEXT: mv t4, t5 +; RV32I-NEXT: mv t3, t5 ; RV32I-NEXT: .LBB17_8: ; RV32I-NEXT: sltu t5, a3, a1 ; RV32I-NEXT: mv t6, t5 @@ -1164,17 +1164,17 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: # %bb.9: ; RV32I-NEXT: sltu t6, a4, a2 ; RV32I-NEXT: .LBB17_10: -; RV32I-NEXT: bnez t3, .LBB17_12 +; RV32I-NEXT: bnez t4, .LBB17_12 ; RV32I-NEXT: # %bb.11: ; RV32I-NEXT: sub a7, t1, a7 ; RV32I-NEXT: sub a5, a6, a5 ; RV32I-NEXT: sub a1, a1, a3 ; RV32I-NEXT: sub a2, a2, a4 ; RV32I-NEXT: sub a4, a7, t0 -; RV32I-NEXT: sltu a6, a5, t4 +; RV32I-NEXT: sltu a6, a5, t3 ; RV32I-NEXT: sub a3, a2, t2 ; RV32I-NEXT: sub a2, a4, a6 -; RV32I-NEXT: sub a4, a5, t4 +; RV32I-NEXT: sub a4, a5, t3 ; RV32I-NEXT: j .LBB17_13 ; RV32I-NEXT: .LBB17_12: ; RV32I-NEXT: sltu t0, a5, a6 @@ -1228,17 +1228,17 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: lw t1, 12(a2) ; RV32ZBB-NEXT: lw a2, 4(a2) ; RV32ZBB-NEXT: sltu t0, a6, a5 -; RV32ZBB-NEXT: mv t4, t0 +; RV32ZBB-NEXT: mv t3, t0 ; RV32ZBB-NEXT: beq a7, t1, .LBB17_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: slt t4, t1, a7 +; RV32ZBB-NEXT: slt t3, t1, a7 ; RV32ZBB-NEXT: .LBB17_2: ; RV32ZBB-NEXT: sltu t2, a1, a3 ; RV32ZBB-NEXT: sltu t5, a2, a4 -; 
RV32ZBB-NEXT: mv t3, t2 +; RV32ZBB-NEXT: mv t4, t2 ; RV32ZBB-NEXT: beq a4, a2, .LBB17_4 ; RV32ZBB-NEXT: # %bb.3: -; RV32ZBB-NEXT: mv t3, t5 +; RV32ZBB-NEXT: mv t4, t5 ; RV32ZBB-NEXT: .LBB17_4: ; RV32ZBB-NEXT: addi sp, sp, -16 ; RV32ZBB-NEXT: sw s0, 12(sp) # 4-byte Folded Spill @@ -1247,12 +1247,12 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: or t6, s0, t6 ; RV32ZBB-NEXT: beqz t6, .LBB17_6 ; RV32ZBB-NEXT: # %bb.5: -; RV32ZBB-NEXT: mv t3, t4 +; RV32ZBB-NEXT: mv t4, t3 ; RV32ZBB-NEXT: .LBB17_6: -; RV32ZBB-NEXT: mv t4, t2 +; RV32ZBB-NEXT: mv t3, t2 ; RV32ZBB-NEXT: beq a2, a4, .LBB17_8 ; RV32ZBB-NEXT: # %bb.7: -; RV32ZBB-NEXT: mv t4, t5 +; RV32ZBB-NEXT: mv t3, t5 ; RV32ZBB-NEXT: .LBB17_8: ; RV32ZBB-NEXT: sltu t5, a3, a1 ; RV32ZBB-NEXT: mv t6, t5 @@ -1260,17 +1260,17 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: # %bb.9: ; RV32ZBB-NEXT: sltu t6, a4, a2 ; RV32ZBB-NEXT: .LBB17_10: -; RV32ZBB-NEXT: bnez t3, .LBB17_12 +; RV32ZBB-NEXT: bnez t4, .LBB17_12 ; RV32ZBB-NEXT: # %bb.11: ; RV32ZBB-NEXT: sub a7, t1, a7 ; RV32ZBB-NEXT: sub a5, a6, a5 ; RV32ZBB-NEXT: sub a1, a1, a3 ; RV32ZBB-NEXT: sub a2, a2, a4 ; RV32ZBB-NEXT: sub a4, a7, t0 -; RV32ZBB-NEXT: sltu a6, a5, t4 +; RV32ZBB-NEXT: sltu a6, a5, t3 ; RV32ZBB-NEXT: sub a3, a2, t2 ; RV32ZBB-NEXT: sub a2, a4, a6 -; RV32ZBB-NEXT: sub a4, a5, t4 +; RV32ZBB-NEXT: sub a4, a5, t3 ; RV32ZBB-NEXT: j .LBB17_13 ; RV32ZBB-NEXT: .LBB17_12: ; RV32ZBB-NEXT: sltu t0, a5, a6 @@ -1523,17 +1523,17 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: lw t1, 12(a2) ; RV32I-NEXT: lw a2, 4(a2) ; RV32I-NEXT: sltu t0, a6, a5 -; RV32I-NEXT: mv t4, t0 +; RV32I-NEXT: mv t3, t0 ; RV32I-NEXT: beq a7, t1, .LBB22_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: slt t4, t1, a7 +; RV32I-NEXT: slt t3, t1, a7 ; RV32I-NEXT: .LBB22_2: ; RV32I-NEXT: sltu t2, a1, a3 ; RV32I-NEXT: sltu t5, a2, a4 -; RV32I-NEXT: mv t3, t2 +; RV32I-NEXT: mv t4, t2 ; RV32I-NEXT: beq a4, a2, .LBB22_4 ; RV32I-NEXT: # 
%bb.3: -; RV32I-NEXT: mv t3, t5 +; RV32I-NEXT: mv t4, t5 ; RV32I-NEXT: .LBB22_4: ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: sw s0, 12(sp) # 4-byte Folded Spill @@ -1542,12 +1542,12 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: or t6, s0, t6 ; RV32I-NEXT: beqz t6, .LBB22_6 ; RV32I-NEXT: # %bb.5: -; RV32I-NEXT: mv t3, t4 +; RV32I-NEXT: mv t4, t3 ; RV32I-NEXT: .LBB22_6: -; RV32I-NEXT: mv t4, t2 +; RV32I-NEXT: mv t3, t2 ; RV32I-NEXT: beq a2, a4, .LBB22_8 ; RV32I-NEXT: # %bb.7: -; RV32I-NEXT: mv t4, t5 +; RV32I-NEXT: mv t3, t5 ; RV32I-NEXT: .LBB22_8: ; RV32I-NEXT: sltu t5, a3, a1 ; RV32I-NEXT: mv t6, t5 @@ -1555,17 +1555,17 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: # %bb.9: ; RV32I-NEXT: sltu t6, a4, a2 ; RV32I-NEXT: .LBB22_10: -; RV32I-NEXT: bnez t3, .LBB22_12 +; RV32I-NEXT: bnez t4, .LBB22_12 ; RV32I-NEXT: # %bb.11: ; RV32I-NEXT: sub a7, t1, a7 ; RV32I-NEXT: sub a5, a6, a5 ; RV32I-NEXT: sub a1, a1, a3 ; RV32I-NEXT: sub a2, a2, a4 ; RV32I-NEXT: sub a4, a7, t0 -; RV32I-NEXT: sltu a6, a5, t4 +; RV32I-NEXT: sltu a6, a5, t3 ; RV32I-NEXT: sub a3, a2, t2 ; RV32I-NEXT: sub a2, a4, a6 -; RV32I-NEXT: sub a4, a5, t4 +; RV32I-NEXT: sub a4, a5, t3 ; RV32I-NEXT: j .LBB22_13 ; RV32I-NEXT: .LBB22_12: ; RV32I-NEXT: sltu t0, a5, a6 @@ -1619,17 +1619,17 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: lw t1, 12(a2) ; RV32ZBB-NEXT: lw a2, 4(a2) ; RV32ZBB-NEXT: sltu t0, a6, a5 -; RV32ZBB-NEXT: mv t4, t0 +; RV32ZBB-NEXT: mv t3, t0 ; RV32ZBB-NEXT: beq a7, t1, .LBB22_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: slt t4, t1, a7 +; RV32ZBB-NEXT: slt t3, t1, a7 ; RV32ZBB-NEXT: .LBB22_2: ; RV32ZBB-NEXT: sltu t2, a1, a3 ; RV32ZBB-NEXT: sltu t5, a2, a4 -; RV32ZBB-NEXT: mv t3, t2 +; RV32ZBB-NEXT: mv t4, t2 ; RV32ZBB-NEXT: beq a4, a2, .LBB22_4 ; RV32ZBB-NEXT: # %bb.3: -; RV32ZBB-NEXT: mv t3, t5 +; RV32ZBB-NEXT: mv t4, t5 ; RV32ZBB-NEXT: .LBB22_4: ; RV32ZBB-NEXT: addi sp, sp, -16 ; RV32ZBB-NEXT: sw s0, 12(sp) # 4-byte 
Folded Spill @@ -1638,12 +1638,12 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: or t6, s0, t6 ; RV32ZBB-NEXT: beqz t6, .LBB22_6 ; RV32ZBB-NEXT: # %bb.5: -; RV32ZBB-NEXT: mv t3, t4 +; RV32ZBB-NEXT: mv t4, t3 ; RV32ZBB-NEXT: .LBB22_6: -; RV32ZBB-NEXT: mv t4, t2 +; RV32ZBB-NEXT: mv t3, t2 ; RV32ZBB-NEXT: beq a2, a4, .LBB22_8 ; RV32ZBB-NEXT: # %bb.7: -; RV32ZBB-NEXT: mv t4, t5 +; RV32ZBB-NEXT: mv t3, t5 ; RV32ZBB-NEXT: .LBB22_8: ; RV32ZBB-NEXT: sltu t5, a3, a1 ; RV32ZBB-NEXT: mv t6, t5 @@ -1651,17 +1651,17 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: # %bb.9: ; RV32ZBB-NEXT: sltu t6, a4, a2 ; RV32ZBB-NEXT: .LBB22_10: -; RV32ZBB-NEXT: bnez t3, .LBB22_12 +; RV32ZBB-NEXT: bnez t4, .LBB22_12 ; RV32ZBB-NEXT: # %bb.11: ; RV32ZBB-NEXT: sub a7, t1, a7 ; RV32ZBB-NEXT: sub a5, a6, a5 ; RV32ZBB-NEXT: sub a1, a1, a3 ; RV32ZBB-NEXT: sub a2, a2, a4 ; RV32ZBB-NEXT: sub a4, a7, t0 -; RV32ZBB-NEXT: sltu a6, a5, t4 +; RV32ZBB-NEXT: sltu a6, a5, t3 ; RV32ZBB-NEXT: sub a3, a2, t2 ; RV32ZBB-NEXT: sub a2, a4, a6 -; RV32ZBB-NEXT: sub a4, a5, t4 +; RV32ZBB-NEXT: sub a4, a5, t3 ; RV32ZBB-NEXT: j .LBB22_13 ; RV32ZBB-NEXT: .LBB22_12: ; RV32ZBB-NEXT: sltu t0, a5, a6 @@ -2514,17 +2514,17 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: lw t1, 12(a2) ; RV32I-NEXT: lw a2, 4(a2) ; RV32I-NEXT: sltu t0, a6, a5 -; RV32I-NEXT: mv t4, t0 +; RV32I-NEXT: mv t3, t0 ; RV32I-NEXT: beq a7, t1, .LBB38_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: slt t4, t1, a7 +; RV32I-NEXT: slt t3, t1, a7 ; RV32I-NEXT: .LBB38_2: ; RV32I-NEXT: sltu t2, a1, a3 ; RV32I-NEXT: sltu t5, a2, a4 -; RV32I-NEXT: mv t3, t2 +; RV32I-NEXT: mv t4, t2 ; RV32I-NEXT: beq a4, a2, .LBB38_4 ; RV32I-NEXT: # %bb.3: -; RV32I-NEXT: mv t3, t5 +; RV32I-NEXT: mv t4, t5 ; RV32I-NEXT: .LBB38_4: ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: sw s0, 12(sp) # 4-byte Folded Spill @@ -2533,12 +2533,12 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: or t6, s0, 
t6 ; RV32I-NEXT: beqz t6, .LBB38_6 ; RV32I-NEXT: # %bb.5: -; RV32I-NEXT: mv t3, t4 +; RV32I-NEXT: mv t4, t3 ; RV32I-NEXT: .LBB38_6: -; RV32I-NEXT: mv t4, t2 +; RV32I-NEXT: mv t3, t2 ; RV32I-NEXT: beq a2, a4, .LBB38_8 ; RV32I-NEXT: # %bb.7: -; RV32I-NEXT: mv t4, t5 +; RV32I-NEXT: mv t3, t5 ; RV32I-NEXT: .LBB38_8: ; RV32I-NEXT: sltu t5, a3, a1 ; RV32I-NEXT: mv t6, t5 @@ -2546,17 +2546,17 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: # %bb.9: ; RV32I-NEXT: sltu t6, a4, a2 ; RV32I-NEXT: .LBB38_10: -; RV32I-NEXT: bnez t3, .LBB38_12 +; RV32I-NEXT: bnez t4, .LBB38_12 ; RV32I-NEXT: # %bb.11: ; RV32I-NEXT: sub a7, t1, a7 ; RV32I-NEXT: sub a5, a6, a5 ; RV32I-NEXT: sub a1, a1, a3 ; RV32I-NEXT: sub a2, a2, a4 ; RV32I-NEXT: sub a4, a7, t0 -; RV32I-NEXT: sltu a6, a5, t4 +; RV32I-NEXT: sltu a6, a5, t3 ; RV32I-NEXT: sub a3, a2, t2 ; RV32I-NEXT: sub a2, a4, a6 -; RV32I-NEXT: sub a4, a5, t4 +; RV32I-NEXT: sub a4, a5, t3 ; RV32I-NEXT: j .LBB38_13 ; RV32I-NEXT: .LBB38_12: ; RV32I-NEXT: sltu t0, a5, a6 @@ -2610,17 +2610,17 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: lw t1, 12(a2) ; RV32ZBB-NEXT: lw a2, 4(a2) ; RV32ZBB-NEXT: sltu t0, a6, a5 -; RV32ZBB-NEXT: mv t4, t0 +; RV32ZBB-NEXT: mv t3, t0 ; RV32ZBB-NEXT: beq a7, t1, .LBB38_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: slt t4, t1, a7 +; RV32ZBB-NEXT: slt t3, t1, a7 ; RV32ZBB-NEXT: .LBB38_2: ; RV32ZBB-NEXT: sltu t2, a1, a3 ; RV32ZBB-NEXT: sltu t5, a2, a4 -; RV32ZBB-NEXT: mv t3, t2 +; RV32ZBB-NEXT: mv t4, t2 ; RV32ZBB-NEXT: beq a4, a2, .LBB38_4 ; RV32ZBB-NEXT: # %bb.3: -; RV32ZBB-NEXT: mv t3, t5 +; RV32ZBB-NEXT: mv t4, t5 ; RV32ZBB-NEXT: .LBB38_4: ; RV32ZBB-NEXT: addi sp, sp, -16 ; RV32ZBB-NEXT: sw s0, 12(sp) # 4-byte Folded Spill @@ -2629,12 +2629,12 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: or t6, s0, t6 ; RV32ZBB-NEXT: beqz t6, .LBB38_6 ; RV32ZBB-NEXT: # %bb.5: -; RV32ZBB-NEXT: mv t3, t4 +; RV32ZBB-NEXT: mv t4, t3 ; RV32ZBB-NEXT: 
.LBB38_6: -; RV32ZBB-NEXT: mv t4, t2 +; RV32ZBB-NEXT: mv t3, t2 ; RV32ZBB-NEXT: beq a2, a4, .LBB38_8 ; RV32ZBB-NEXT: # %bb.7: -; RV32ZBB-NEXT: mv t4, t5 +; RV32ZBB-NEXT: mv t3, t5 ; RV32ZBB-NEXT: .LBB38_8: ; RV32ZBB-NEXT: sltu t5, a3, a1 ; RV32ZBB-NEXT: mv t6, t5 @@ -2642,17 +2642,17 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: # %bb.9: ; RV32ZBB-NEXT: sltu t6, a4, a2 ; RV32ZBB-NEXT: .LBB38_10: -; RV32ZBB-NEXT: bnez t3, .LBB38_12 +; RV32ZBB-NEXT: bnez t4, .LBB38_12 ; RV32ZBB-NEXT: # %bb.11: ; RV32ZBB-NEXT: sub a7, t1, a7 ; RV32ZBB-NEXT: sub a5, a6, a5 ; RV32ZBB-NEXT: sub a1, a1, a3 ; RV32ZBB-NEXT: sub a2, a2, a4 ; RV32ZBB-NEXT: sub a4, a7, t0 -; RV32ZBB-NEXT: sltu a6, a5, t4 +; RV32ZBB-NEXT: sltu a6, a5, t3 ; RV32ZBB-NEXT: sub a3, a2, t2 ; RV32ZBB-NEXT: sub a2, a4, a6 -; RV32ZBB-NEXT: sub a4, a5, t4 +; RV32ZBB-NEXT: sub a4, a5, t3 ; RV32ZBB-NEXT: j .LBB38_13 ; RV32ZBB-NEXT: .LBB38_12: ; RV32ZBB-NEXT: sltu t0, a5, a6 diff --git a/llvm/test/CodeGen/RISCV/machine-pipeliner.ll b/llvm/test/CodeGen/RISCV/machine-pipeliner.ll index d250098576687..bf81d03ec1352 100644 --- a/llvm/test/CodeGen/RISCV/machine-pipeliner.ll +++ b/llvm/test/CodeGen/RISCV/machine-pipeliner.ll @@ -52,18 +52,18 @@ define void @test_pipelined_1(ptr noalias %in, ptr noalias %out, i32 signext %cn ; ; CHECK-PIPELINED-LABEL: test_pipelined_1: ; CHECK-PIPELINED: # %bb.0: # %entry -; CHECK-PIPELINED-NEXT: blez a2, .LBB1_6 +; CHECK-PIPELINED-NEXT: blez a2, .LBB1_7 ; CHECK-PIPELINED-NEXT: # %bb.1: # %for.body.preheader ; CHECK-PIPELINED-NEXT: lw a4, 0(a1) ; CHECK-PIPELINED-NEXT: addi a2, a2, -1 +; CHECK-PIPELINED-NEXT: addi a3, a0, 4 ; CHECK-PIPELINED-NEXT: sh2add.uw a6, a2, a1 -; CHECK-PIPELINED-NEXT: addi a2, a0, 4 ; CHECK-PIPELINED-NEXT: addi a1, a1, 4 ; CHECK-PIPELINED-NEXT: addi a6, a6, 4 ; CHECK-PIPELINED-NEXT: beq a1, a6, .LBB1_5 ; CHECK-PIPELINED-NEXT: # %bb.2: # %for.body ; CHECK-PIPELINED-NEXT: lw a5, 0(a1) -; CHECK-PIPELINED-NEXT: addi a3, a2, 4 +; 
CHECK-PIPELINED-NEXT: addi a2, a3, 4 ; CHECK-PIPELINED-NEXT: addi a4, a4, 1 ; CHECK-PIPELINED-NEXT: addi a1, a1, 4 ; CHECK-PIPELINED-NEXT: beq a1, a6, .LBB1_4 @@ -72,20 +72,22 @@ define void @test_pipelined_1(ptr noalias %in, ptr noalias %out, i32 signext %cn ; CHECK-PIPELINED-NEXT: sw a4, 0(a0) ; CHECK-PIPELINED-NEXT: mv a4, a5 ; CHECK-PIPELINED-NEXT: lw a5, 0(a1) -; CHECK-PIPELINED-NEXT: mv a0, a2 -; CHECK-PIPELINED-NEXT: mv a2, a3 -; CHECK-PIPELINED-NEXT: addi a3, a3, 4 +; CHECK-PIPELINED-NEXT: mv a0, a3 +; CHECK-PIPELINED-NEXT: mv a3, a2 +; CHECK-PIPELINED-NEXT: addi a2, a2, 4 ; CHECK-PIPELINED-NEXT: addi a4, a4, 1 ; CHECK-PIPELINED-NEXT: addi a1, a1, 4 ; CHECK-PIPELINED-NEXT: bne a1, a6, .LBB1_3 ; CHECK-PIPELINED-NEXT: .LBB1_4: ; CHECK-PIPELINED-NEXT: sw a4, 0(a0) -; CHECK-PIPELINED-NEXT: mv a0, a2 -; CHECK-PIPELINED-NEXT: mv a4, a5 +; CHECK-PIPELINED-NEXT: j .LBB1_6 ; CHECK-PIPELINED-NEXT: .LBB1_5: -; CHECK-PIPELINED-NEXT: addi a4, a4, 1 -; CHECK-PIPELINED-NEXT: sw a4, 0(a0) -; CHECK-PIPELINED-NEXT: .LBB1_6: # %for.end +; CHECK-PIPELINED-NEXT: mv a3, a0 +; CHECK-PIPELINED-NEXT: mv a5, a4 +; CHECK-PIPELINED-NEXT: .LBB1_6: +; CHECK-PIPELINED-NEXT: addi a5, a5, 1 +; CHECK-PIPELINED-NEXT: sw a5, 0(a3) +; CHECK-PIPELINED-NEXT: .LBB1_7: # %for.end ; CHECK-PIPELINED-NEXT: ret entry: %cmp = icmp sgt i32 %cnt, 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll index 67d55366674f3..022b5c8ba6df5 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll @@ -6571,53 +6571,53 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m, ; RV32ZVE32F: # %bb.0: ; RV32ZVE32F-NEXT: lw a4, 32(a2) ; RV32ZVE32F-NEXT: lw a5, 40(a2) -; RV32ZVE32F-NEXT: lw a6, 48(a2) -; RV32ZVE32F-NEXT: lw a7, 56(a2) -; RV32ZVE32F-NEXT: lw t0, 0(a2) +; RV32ZVE32F-NEXT: lw a7, 48(a2) +; RV32ZVE32F-NEXT: lw t0, 56(a2) 
+; RV32ZVE32F-NEXT: lw a6, 0(a2) ; RV32ZVE32F-NEXT: lw t1, 8(a2) ; RV32ZVE32F-NEXT: lw t2, 16(a2) ; RV32ZVE32F-NEXT: lw a2, 24(a2) ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vmv.v.x v8, t0 +; RV32ZVE32F-NEXT: vmv.v.x v8, a6 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma -; RV32ZVE32F-NEXT: vmv.x.s t0, v0 +; RV32ZVE32F-NEXT: vmv.x.s a6, v0 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t1 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t2 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a2 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a4 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a5 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a6 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a7 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t0 ; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3 -; RV32ZVE32F-NEXT: andi a2, t0, 1 +; RV32ZVE32F-NEXT: andi a2, a6, 1 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: beqz a2, .LBB57_7 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load ; RV32ZVE32F-NEXT: vmv.x.s a2, v8 ; RV32ZVE32F-NEXT: lw a1, 0(a2) ; RV32ZVE32F-NEXT: lw a2, 4(a2) -; RV32ZVE32F-NEXT: andi a4, t0, 2 +; RV32ZVE32F-NEXT: andi a4, a6, 2 ; RV32ZVE32F-NEXT: bnez a4, .LBB57_8 ; RV32ZVE32F-NEXT: .LBB57_2: ; RV32ZVE32F-NEXT: lw a4, 8(a3) ; RV32ZVE32F-NEXT: lw a5, 12(a3) -; RV32ZVE32F-NEXT: andi a6, t0, 4 -; RV32ZVE32F-NEXT: bnez a6, .LBB57_9 +; RV32ZVE32F-NEXT: andi a7, a6, 4 +; RV32ZVE32F-NEXT: bnez a7, .LBB57_9 ; RV32ZVE32F-NEXT: .LBB57_3: -; RV32ZVE32F-NEXT: lw a6, 16(a3) -; RV32ZVE32F-NEXT: lw a7, 20(a3) -; RV32ZVE32F-NEXT: andi t1, t0, 8 +; RV32ZVE32F-NEXT: lw a7, 16(a3) +; RV32ZVE32F-NEXT: lw t0, 20(a3) +; RV32ZVE32F-NEXT: andi t1, a6, 8 ; RV32ZVE32F-NEXT: bnez t1, .LBB57_10 ; RV32ZVE32F-NEXT: .LBB57_4: ; RV32ZVE32F-NEXT: lw t1, 24(a3) ; RV32ZVE32F-NEXT: lw t2, 28(a3) -; RV32ZVE32F-NEXT: andi t3, t0, 16 +; RV32ZVE32F-NEXT: andi t3, a6, 16 ; RV32ZVE32F-NEXT: bnez t3, .LBB57_11 ; RV32ZVE32F-NEXT: .LBB57_5: ; RV32ZVE32F-NEXT: lw t3, 
32(a3) ; RV32ZVE32F-NEXT: lw t4, 36(a3) -; RV32ZVE32F-NEXT: andi t5, t0, 32 +; RV32ZVE32F-NEXT: andi t5, a6, 32 ; RV32ZVE32F-NEXT: bnez t5, .LBB57_12 ; RV32ZVE32F-NEXT: .LBB57_6: ; RV32ZVE32F-NEXT: lw t5, 40(a3) @@ -6626,7 +6626,7 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m, ; RV32ZVE32F-NEXT: .LBB57_7: ; RV32ZVE32F-NEXT: lw a1, 0(a3) ; RV32ZVE32F-NEXT: lw a2, 4(a3) -; RV32ZVE32F-NEXT: andi a4, t0, 2 +; RV32ZVE32F-NEXT: andi a4, a6, 2 ; RV32ZVE32F-NEXT: beqz a4, .LBB57_2 ; RV32ZVE32F-NEXT: .LBB57_8: # %cond.load1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma @@ -6634,15 +6634,15 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m, ; RV32ZVE32F-NEXT: vmv.x.s a5, v10 ; RV32ZVE32F-NEXT: lw a4, 0(a5) ; RV32ZVE32F-NEXT: lw a5, 4(a5) -; RV32ZVE32F-NEXT: andi a6, t0, 4 -; RV32ZVE32F-NEXT: beqz a6, .LBB57_3 +; RV32ZVE32F-NEXT: andi a7, a6, 4 +; RV32ZVE32F-NEXT: beqz a7, .LBB57_3 ; RV32ZVE32F-NEXT: .LBB57_9: # %cond.load4 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 -; RV32ZVE32F-NEXT: vmv.x.s a7, v10 -; RV32ZVE32F-NEXT: lw a6, 0(a7) -; RV32ZVE32F-NEXT: lw a7, 4(a7) -; RV32ZVE32F-NEXT: andi t1, t0, 8 +; RV32ZVE32F-NEXT: vmv.x.s t0, v10 +; RV32ZVE32F-NEXT: lw a7, 0(t0) +; RV32ZVE32F-NEXT: lw t0, 4(t0) +; RV32ZVE32F-NEXT: andi t1, a6, 8 ; RV32ZVE32F-NEXT: beqz t1, .LBB57_4 ; RV32ZVE32F-NEXT: .LBB57_10: # %cond.load7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma @@ -6650,7 +6650,7 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m, ; RV32ZVE32F-NEXT: vmv.x.s t2, v10 ; RV32ZVE32F-NEXT: lw t1, 0(t2) ; RV32ZVE32F-NEXT: lw t2, 4(t2) -; RV32ZVE32F-NEXT: andi t3, t0, 16 +; RV32ZVE32F-NEXT: andi t3, a6, 16 ; RV32ZVE32F-NEXT: beqz t3, .LBB57_5 ; RV32ZVE32F-NEXT: .LBB57_11: # %cond.load10 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma @@ -6658,7 +6658,7 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x 
i64> %idxs, <8 x i1> %m, ; RV32ZVE32F-NEXT: vmv.x.s t4, v10 ; RV32ZVE32F-NEXT: lw t3, 0(t4) ; RV32ZVE32F-NEXT: lw t4, 4(t4) -; RV32ZVE32F-NEXT: andi t5, t0, 32 +; RV32ZVE32F-NEXT: andi t5, a6, 32 ; RV32ZVE32F-NEXT: beqz t5, .LBB57_6 ; RV32ZVE32F-NEXT: .LBB57_12: # %cond.load13 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma @@ -6673,7 +6673,7 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m, ; RV32ZVE32F-NEXT: sw s1, 8(sp) # 4-byte Folded Spill ; RV32ZVE32F-NEXT: .cfi_offset s0, -4 ; RV32ZVE32F-NEXT: .cfi_offset s1, -8 -; RV32ZVE32F-NEXT: andi s0, t0, 64 +; RV32ZVE32F-NEXT: andi s0, a6, 64 ; RV32ZVE32F-NEXT: beqz s0, .LBB57_16 ; RV32ZVE32F-NEXT: # %bb.14: # %cond.load16 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma @@ -6681,30 +6681,30 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m, ; RV32ZVE32F-NEXT: vmv.x.s s1, v10 ; RV32ZVE32F-NEXT: lw s0, 0(s1) ; RV32ZVE32F-NEXT: lw s1, 4(s1) -; RV32ZVE32F-NEXT: andi t0, t0, -128 -; RV32ZVE32F-NEXT: bnez t0, .LBB57_17 +; RV32ZVE32F-NEXT: andi a6, a6, -128 +; RV32ZVE32F-NEXT: bnez a6, .LBB57_17 ; RV32ZVE32F-NEXT: .LBB57_15: -; RV32ZVE32F-NEXT: lw t0, 56(a3) +; RV32ZVE32F-NEXT: lw a6, 56(a3) ; RV32ZVE32F-NEXT: lw a3, 60(a3) ; RV32ZVE32F-NEXT: j .LBB57_18 ; RV32ZVE32F-NEXT: .LBB57_16: ; RV32ZVE32F-NEXT: lw s0, 48(a3) ; RV32ZVE32F-NEXT: lw s1, 52(a3) -; RV32ZVE32F-NEXT: andi t0, t0, -128 -; RV32ZVE32F-NEXT: beqz t0, .LBB57_15 +; RV32ZVE32F-NEXT: andi a6, a6, -128 +; RV32ZVE32F-NEXT: beqz a6, .LBB57_15 ; RV32ZVE32F-NEXT: .LBB57_17: # %cond.load19 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 ; RV32ZVE32F-NEXT: vmv.x.s a3, v8 -; RV32ZVE32F-NEXT: lw t0, 0(a3) +; RV32ZVE32F-NEXT: lw a6, 0(a3) ; RV32ZVE32F-NEXT: lw a3, 4(a3) ; RV32ZVE32F-NEXT: .LBB57_18: # %else20 ; RV32ZVE32F-NEXT: sw a1, 0(a0) ; RV32ZVE32F-NEXT: sw a2, 4(a0) ; RV32ZVE32F-NEXT: sw a4, 8(a0) ; RV32ZVE32F-NEXT: sw a5, 12(a0) -; 
RV32ZVE32F-NEXT: sw a6, 16(a0) -; RV32ZVE32F-NEXT: sw a7, 20(a0) +; RV32ZVE32F-NEXT: sw a7, 16(a0) +; RV32ZVE32F-NEXT: sw t0, 20(a0) ; RV32ZVE32F-NEXT: sw t1, 24(a0) ; RV32ZVE32F-NEXT: sw t2, 28(a0) ; RV32ZVE32F-NEXT: sw t3, 32(a0) @@ -6713,7 +6713,7 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m, ; RV32ZVE32F-NEXT: sw t6, 44(a0) ; RV32ZVE32F-NEXT: sw s0, 48(a0) ; RV32ZVE32F-NEXT: sw s1, 52(a0) -; RV32ZVE32F-NEXT: sw t0, 56(a0) +; RV32ZVE32F-NEXT: sw a6, 56(a0) ; RV32ZVE32F-NEXT: sw a3, 60(a0) ; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload @@ -6726,89 +6726,89 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m, ; RV64ZVE32F-LABEL: mgather_baseidx_v8i64: ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a7, v0 -; RV64ZVE32F-NEXT: andi a4, a7, 1 +; RV64ZVE32F-NEXT: vmv.x.s a6, v0 +; RV64ZVE32F-NEXT: andi a4, a6, 1 ; RV64ZVE32F-NEXT: beqz a4, .LBB57_9 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: ld a4, 0(a2) ; RV64ZVE32F-NEXT: slli a4, a4, 3 ; RV64ZVE32F-NEXT: add a4, a1, a4 ; RV64ZVE32F-NEXT: ld a4, 0(a4) -; RV64ZVE32F-NEXT: andi a5, a7, 2 +; RV64ZVE32F-NEXT: andi a5, a6, 2 ; RV64ZVE32F-NEXT: bnez a5, .LBB57_10 ; RV64ZVE32F-NEXT: .LBB57_2: ; RV64ZVE32F-NEXT: ld a5, 8(a3) -; RV64ZVE32F-NEXT: andi a6, a7, 4 -; RV64ZVE32F-NEXT: bnez a6, .LBB57_11 +; RV64ZVE32F-NEXT: andi a7, a6, 4 +; RV64ZVE32F-NEXT: bnez a7, .LBB57_11 ; RV64ZVE32F-NEXT: .LBB57_3: -; RV64ZVE32F-NEXT: ld a6, 16(a3) -; RV64ZVE32F-NEXT: andi t0, a7, 8 +; RV64ZVE32F-NEXT: ld a7, 16(a3) +; RV64ZVE32F-NEXT: andi t0, a6, 8 ; RV64ZVE32F-NEXT: bnez t0, .LBB57_12 ; RV64ZVE32F-NEXT: .LBB57_4: ; RV64ZVE32F-NEXT: ld t0, 24(a3) -; RV64ZVE32F-NEXT: andi t1, a7, 16 +; RV64ZVE32F-NEXT: andi t1, a6, 16 ; RV64ZVE32F-NEXT: bnez t1, .LBB57_13 ; RV64ZVE32F-NEXT: .LBB57_5: ; RV64ZVE32F-NEXT: ld t1, 32(a3) -; RV64ZVE32F-NEXT: andi 
t2, a7, 32 +; RV64ZVE32F-NEXT: andi t2, a6, 32 ; RV64ZVE32F-NEXT: bnez t2, .LBB57_14 ; RV64ZVE32F-NEXT: .LBB57_6: ; RV64ZVE32F-NEXT: ld t2, 40(a3) -; RV64ZVE32F-NEXT: andi t3, a7, 64 +; RV64ZVE32F-NEXT: andi t3, a6, 64 ; RV64ZVE32F-NEXT: bnez t3, .LBB57_15 ; RV64ZVE32F-NEXT: .LBB57_7: ; RV64ZVE32F-NEXT: ld t3, 48(a3) -; RV64ZVE32F-NEXT: andi a7, a7, -128 -; RV64ZVE32F-NEXT: bnez a7, .LBB57_16 +; RV64ZVE32F-NEXT: andi a6, a6, -128 +; RV64ZVE32F-NEXT: bnez a6, .LBB57_16 ; RV64ZVE32F-NEXT: .LBB57_8: ; RV64ZVE32F-NEXT: ld a1, 56(a3) ; RV64ZVE32F-NEXT: j .LBB57_17 ; RV64ZVE32F-NEXT: .LBB57_9: ; RV64ZVE32F-NEXT: ld a4, 0(a3) -; RV64ZVE32F-NEXT: andi a5, a7, 2 +; RV64ZVE32F-NEXT: andi a5, a6, 2 ; RV64ZVE32F-NEXT: beqz a5, .LBB57_2 ; RV64ZVE32F-NEXT: .LBB57_10: # %cond.load1 ; RV64ZVE32F-NEXT: ld a5, 8(a2) ; RV64ZVE32F-NEXT: slli a5, a5, 3 ; RV64ZVE32F-NEXT: add a5, a1, a5 ; RV64ZVE32F-NEXT: ld a5, 0(a5) -; RV64ZVE32F-NEXT: andi a6, a7, 4 -; RV64ZVE32F-NEXT: beqz a6, .LBB57_3 +; RV64ZVE32F-NEXT: andi a7, a6, 4 +; RV64ZVE32F-NEXT: beqz a7, .LBB57_3 ; RV64ZVE32F-NEXT: .LBB57_11: # %cond.load4 -; RV64ZVE32F-NEXT: ld a6, 16(a2) -; RV64ZVE32F-NEXT: slli a6, a6, 3 -; RV64ZVE32F-NEXT: add a6, a1, a6 -; RV64ZVE32F-NEXT: ld a6, 0(a6) -; RV64ZVE32F-NEXT: andi t0, a7, 8 +; RV64ZVE32F-NEXT: ld a7, 16(a2) +; RV64ZVE32F-NEXT: slli a7, a7, 3 +; RV64ZVE32F-NEXT: add a7, a1, a7 +; RV64ZVE32F-NEXT: ld a7, 0(a7) +; RV64ZVE32F-NEXT: andi t0, a6, 8 ; RV64ZVE32F-NEXT: beqz t0, .LBB57_4 ; RV64ZVE32F-NEXT: .LBB57_12: # %cond.load7 ; RV64ZVE32F-NEXT: ld t0, 24(a2) ; RV64ZVE32F-NEXT: slli t0, t0, 3 ; RV64ZVE32F-NEXT: add t0, a1, t0 ; RV64ZVE32F-NEXT: ld t0, 0(t0) -; RV64ZVE32F-NEXT: andi t1, a7, 16 +; RV64ZVE32F-NEXT: andi t1, a6, 16 ; RV64ZVE32F-NEXT: beqz t1, .LBB57_5 ; RV64ZVE32F-NEXT: .LBB57_13: # %cond.load10 ; RV64ZVE32F-NEXT: ld t1, 32(a2) ; RV64ZVE32F-NEXT: slli t1, t1, 3 ; RV64ZVE32F-NEXT: add t1, a1, t1 ; RV64ZVE32F-NEXT: ld t1, 0(t1) -; RV64ZVE32F-NEXT: andi t2, a7, 32 +; 
RV64ZVE32F-NEXT: andi t2, a6, 32 ; RV64ZVE32F-NEXT: beqz t2, .LBB57_6 ; RV64ZVE32F-NEXT: .LBB57_14: # %cond.load13 ; RV64ZVE32F-NEXT: ld t2, 40(a2) ; RV64ZVE32F-NEXT: slli t2, t2, 3 ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: ld t2, 0(t2) -; RV64ZVE32F-NEXT: andi t3, a7, 64 +; RV64ZVE32F-NEXT: andi t3, a6, 64 ; RV64ZVE32F-NEXT: beqz t3, .LBB57_7 ; RV64ZVE32F-NEXT: .LBB57_15: # %cond.load16 ; RV64ZVE32F-NEXT: ld t3, 48(a2) ; RV64ZVE32F-NEXT: slli t3, t3, 3 ; RV64ZVE32F-NEXT: add t3, a1, t3 ; RV64ZVE32F-NEXT: ld t3, 0(t3) -; RV64ZVE32F-NEXT: andi a7, a7, -128 -; RV64ZVE32F-NEXT: beqz a7, .LBB57_8 +; RV64ZVE32F-NEXT: andi a6, a6, -128 +; RV64ZVE32F-NEXT: beqz a6, .LBB57_8 ; RV64ZVE32F-NEXT: .LBB57_16: # %cond.load19 ; RV64ZVE32F-NEXT: ld a2, 56(a2) ; RV64ZVE32F-NEXT: slli a2, a2, 3 @@ -6817,7 +6817,7 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m, ; RV64ZVE32F-NEXT: .LBB57_17: # %else20 ; RV64ZVE32F-NEXT: sd a4, 0(a0) ; RV64ZVE32F-NEXT: sd a5, 8(a0) -; RV64ZVE32F-NEXT: sd a6, 16(a0) +; RV64ZVE32F-NEXT: sd a7, 16(a0) ; RV64ZVE32F-NEXT: sd t0, 24(a0) ; RV64ZVE32F-NEXT: sd t1, 32(a0) ; RV64ZVE32F-NEXT: sd t2, 40(a0) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll index 48845c54c5603..bbb57ed99cdcc 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll @@ -897,55 +897,54 @@ define void @strided_load_startval_add_with_splat(ptr noalias nocapture %arg, pt ; CHECK-NEXT: beq a2, a3, .LBB14_7 ; CHECK-NEXT: # %bb.1: # %bb3 ; CHECK-NEXT: li a3, 1023 -; CHECK-NEXT: subw a5, a3, a2 -; CHECK-NEXT: li a6, 31 -; CHECK-NEXT: mv a4, a2 -; CHECK-NEXT: bltu a5, a6, .LBB14_5 +; CHECK-NEXT: subw a4, a3, a2 +; CHECK-NEXT: li a5, 31 +; CHECK-NEXT: bltu a4, a5, .LBB14_5 ; CHECK-NEXT: # %bb.2: # %bb9 -; CHECK-NEXT: slli a4, a5, 32 -; CHECK-NEXT: slli 
t0, a2, 2 -; CHECK-NEXT: add a5, a0, a2 +; CHECK-NEXT: slli a5, a4, 32 +; CHECK-NEXT: slli a7, a2, 2 +; CHECK-NEXT: add a4, a0, a2 ; CHECK-NEXT: add a6, a1, a2 ; CHECK-NEXT: li t2, 32 -; CHECK-NEXT: srli a4, a4, 32 -; CHECK-NEXT: add t0, a6, t0 -; CHECK-NEXT: addi a6, a4, 1 -; CHECK-NEXT: andi a7, a6, -32 -; CHECK-NEXT: add a4, a7, a2 -; CHECK-NEXT: add a2, a4, a0 +; CHECK-NEXT: srli a5, a5, 32 +; CHECK-NEXT: add a7, a6, a7 +; CHECK-NEXT: addi a5, a5, 1 +; CHECK-NEXT: andi a6, a5, -32 +; CHECK-NEXT: add a2, a6, a2 +; CHECK-NEXT: add t0, a2, a0 ; CHECK-NEXT: li t1, 5 ; CHECK-NEXT: vsetvli zero, t2, e8, m1, ta, ma ; CHECK-NEXT: .LBB14_3: # %bb15 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vlse8.v v8, (t0), t1 -; CHECK-NEXT: vle8.v v9, (a5) +; CHECK-NEXT: vlse8.v v8, (a7), t1 +; CHECK-NEXT: vle8.v v9, (a4) ; CHECK-NEXT: vadd.vv v8, v9, v8 -; CHECK-NEXT: vse8.v v8, (a5) -; CHECK-NEXT: addi a5, a5, 32 -; CHECK-NEXT: addi t0, t0, 160 -; CHECK-NEXT: bne a5, a2, .LBB14_3 +; CHECK-NEXT: vse8.v v8, (a4) +; CHECK-NEXT: addi a4, a4, 32 +; CHECK-NEXT: addi a7, a7, 160 +; CHECK-NEXT: bne a4, t0, .LBB14_3 ; CHECK-NEXT: # %bb.4: # %bb30 -; CHECK-NEXT: beq a6, a7, .LBB14_7 +; CHECK-NEXT: beq a5, a6, .LBB14_7 ; CHECK-NEXT: .LBB14_5: # %bb32 -; CHECK-NEXT: add a2, a0, a4 -; CHECK-NEXT: slli a5, a4, 2 -; CHECK-NEXT: add a1, a1, a4 -; CHECK-NEXT: sub a3, a3, a4 +; CHECK-NEXT: add a4, a0, a2 +; CHECK-NEXT: slli a5, a2, 2 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: sub a3, a3, a2 ; CHECK-NEXT: add a1, a1, a5 ; CHECK-NEXT: slli a3, a3, 32 ; CHECK-NEXT: srli a3, a3, 32 -; CHECK-NEXT: add a0, a4, a0 +; CHECK-NEXT: add a0, a2, a0 ; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: addi a0, a0, 1 ; CHECK-NEXT: .LBB14_6: # %bb35 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lbu a3, 0(a1) -; CHECK-NEXT: lbu a4, 0(a2) -; CHECK-NEXT: add a3, a4, a3 -; CHECK-NEXT: sb a3, 0(a2) -; CHECK-NEXT: addi a2, a2, 1 +; CHECK-NEXT: lbu a2, 0(a1) +; CHECK-NEXT: lbu a3, 0(a4) +; 
CHECK-NEXT: add a2, a3, a2 +; CHECK-NEXT: sb a2, 0(a4) +; CHECK-NEXT: addi a4, a4, 1 ; CHECK-NEXT: addi a1, a1, 5 -; CHECK-NEXT: bne a2, a0, .LBB14_6 +; CHECK-NEXT: bne a4, a0, .LBB14_6 ; CHECK-NEXT: .LBB14_7: # %bb34 ; CHECK-NEXT: ret bb: diff --git a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll index 7990dfc0880a5..52f2bd42dc5ea 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll @@ -29,32 +29,30 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_ ; RV32-NEXT: # %bb.3: # %for.cond1.preheader.us.preheader ; RV32-NEXT: li t0, 32 ; RV32-NEXT: # %bb.4: # %for.cond1.preheader.us.preheader -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s1, 8(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s2, 4(sp) # 4-byte Folded Spill -; RV32-NEXT: .cfi_offset s0, -4 -; RV32-NEXT: .cfi_offset s1, -8 -; RV32-NEXT: .cfi_offset s2, -12 -; RV32-NEXT: .cfi_remember_state ; RV32-NEXT: add t3, a0, t3 ; RV32-NEXT: add t4, a2, t4 -; RV32-NEXT: add s0, a4, t5 +; RV32-NEXT: add t5, a4, t5 ; RV32-NEXT: bltu t6, t1, .LBB0_6 ; RV32-NEXT: # %bb.5: # %for.cond1.preheader.us.preheader ; RV32-NEXT: li t1, 32 ; RV32-NEXT: .LBB0_6: # %for.cond1.preheader.us.preheader ; RV32-NEXT: add t3, t3, a6 -; RV32-NEXT: add t5, t4, a6 -; RV32-NEXT: add t4, s0, a6 +; RV32-NEXT: add t6, t4, a6 +; RV32-NEXT: add t4, t5, a6 ; RV32-NEXT: j .LBB0_8 ; RV32-NEXT: # %bb.7: # %for.cond1.preheader.us.preheader ; RV32-NEXT: mv t1, t0 ; RV32-NEXT: .LBB0_8: # %for.cond1.preheader.us.preheader -; RV32-NEXT: .cfi_restore_state +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s2, 4(sp) # 4-byte Folded Spill +; RV32-NEXT: .cfi_offset s0, -4 +; 
RV32-NEXT: .cfi_offset s1, -8 +; RV32-NEXT: .cfi_offset s2, -12 ; RV32-NEXT: li t0, 0 -; RV32-NEXT: sltu t5, a0, t5 +; RV32-NEXT: sltu t5, a0, t6 ; RV32-NEXT: sltu t6, a2, t3 ; RV32-NEXT: and t5, t5, t6 ; RV32-NEXT: sltu t4, a0, t4 diff --git a/llvm/test/CodeGen/RISCV/xcvbi.ll b/llvm/test/CodeGen/RISCV/xcvbi.ll index ca2e416e334f0..d5d11585970b0 100644 --- a/llvm/test/CodeGen/RISCV/xcvbi.ll +++ b/llvm/test/CodeGen/RISCV/xcvbi.ll @@ -67,14 +67,14 @@ define i32 @select_beqimm_1(i32 %a, i32 %x, i32 %y) { ; CHECK_NOPT: # %bb.0: # %entry ; CHECK_NOPT-NEXT: addi sp, sp, -16 ; CHECK_NOPT-NEXT: .cfi_def_cfa_offset 16 -; CHECK_NOPT-NEXT: sw a1, 8(sp) # 4-byte Folded Spill -; CHECK_NOPT-NEXT: sw a2, 12(sp) # 4-byte Folded Spill +; CHECK_NOPT-NEXT: sw a2, 8(sp) # 4-byte Folded Spill +; CHECK_NOPT-NEXT: sw a1, 12(sp) # 4-byte Folded Spill ; CHECK_NOPT-NEXT: cv.beqimm a0, -16, .LBB2_2 ; CHECK_NOPT-NEXT: # %bb.1: # %entry -; CHECK_NOPT-NEXT: lw a0, 8(sp) # 4-byte Folded Reload -; CHECK_NOPT-NEXT: sw a0, 12(sp) # 4-byte Folded Spill -; CHECK_NOPT-NEXT: .LBB2_2: # %entry ; CHECK_NOPT-NEXT: lw a0, 12(sp) # 4-byte Folded Reload +; CHECK_NOPT-NEXT: sw a0, 8(sp) # 4-byte Folded Spill +; CHECK_NOPT-NEXT: .LBB2_2: # %entry +; CHECK_NOPT-NEXT: lw a0, 8(sp) # 4-byte Folded Reload ; CHECK_NOPT-NEXT: addi sp, sp, 16 ; CHECK_NOPT-NEXT: .cfi_def_cfa_offset 0 ; CHECK_NOPT-NEXT: ret @@ -98,14 +98,14 @@ define i32 @select_beqimm_2(i32 %a, i32 %x, i32 %y) { ; CHECK_NOPT: # %bb.0: # %entry ; CHECK_NOPT-NEXT: addi sp, sp, -16 ; CHECK_NOPT-NEXT: .cfi_def_cfa_offset 16 -; CHECK_NOPT-NEXT: sw a1, 8(sp) # 4-byte Folded Spill -; CHECK_NOPT-NEXT: sw a2, 12(sp) # 4-byte Folded Spill +; CHECK_NOPT-NEXT: sw a2, 8(sp) # 4-byte Folded Spill +; CHECK_NOPT-NEXT: sw a1, 12(sp) # 4-byte Folded Spill ; CHECK_NOPT-NEXT: cv.beqimm a0, 0, .LBB3_2 ; CHECK_NOPT-NEXT: # %bb.1: # %entry -; CHECK_NOPT-NEXT: lw a0, 8(sp) # 4-byte Folded Reload -; CHECK_NOPT-NEXT: sw a0, 12(sp) # 4-byte Folded Spill -; CHECK_NOPT-NEXT: 
.LBB3_2: # %entry ; CHECK_NOPT-NEXT: lw a0, 12(sp) # 4-byte Folded Reload +; CHECK_NOPT-NEXT: sw a0, 8(sp) # 4-byte Folded Spill +; CHECK_NOPT-NEXT: .LBB3_2: # %entry +; CHECK_NOPT-NEXT: lw a0, 8(sp) # 4-byte Folded Reload ; CHECK_NOPT-NEXT: addi sp, sp, 16 ; CHECK_NOPT-NEXT: .cfi_def_cfa_offset 0 ; CHECK_NOPT-NEXT: ret @@ -129,14 +129,14 @@ define i32 @select_beqimm_3(i32 %a, i32 %x, i32 %y) { ; CHECK_NOPT: # %bb.0: # %entry ; CHECK_NOPT-NEXT: addi sp, sp, -16 ; CHECK_NOPT-NEXT: .cfi_def_cfa_offset 16 -; CHECK_NOPT-NEXT: sw a1, 8(sp) # 4-byte Folded Spill -; CHECK_NOPT-NEXT: sw a2, 12(sp) # 4-byte Folded Spill +; CHECK_NOPT-NEXT: sw a2, 8(sp) # 4-byte Folded Spill +; CHECK_NOPT-NEXT: sw a1, 12(sp) # 4-byte Folded Spill ; CHECK_NOPT-NEXT: cv.beqimm a0, 15, .LBB4_2 ; CHECK_NOPT-NEXT: # %bb.1: # %entry -; CHECK_NOPT-NEXT: lw a0, 8(sp) # 4-byte Folded Reload -; CHECK_NOPT-NEXT: sw a0, 12(sp) # 4-byte Folded Spill -; CHECK_NOPT-NEXT: .LBB4_2: # %entry ; CHECK_NOPT-NEXT: lw a0, 12(sp) # 4-byte Folded Reload +; CHECK_NOPT-NEXT: sw a0, 8(sp) # 4-byte Folded Spill +; CHECK_NOPT-NEXT: .LBB4_2: # %entry +; CHECK_NOPT-NEXT: lw a0, 8(sp) # 4-byte Folded Reload ; CHECK_NOPT-NEXT: addi sp, sp, 16 ; CHECK_NOPT-NEXT: .cfi_def_cfa_offset 0 ; CHECK_NOPT-NEXT: ret @@ -160,15 +160,15 @@ define i32 @select_no_beqimm_1(i32 %a, i32 %x, i32 %y) { ; CHECK_NOPT: # %bb.0: # %entry ; CHECK_NOPT-NEXT: addi sp, sp, -16 ; CHECK_NOPT-NEXT: .cfi_def_cfa_offset 16 -; CHECK_NOPT-NEXT: sw a1, 8(sp) # 4-byte Folded Spill +; CHECK_NOPT-NEXT: sw a2, 8(sp) # 4-byte Folded Spill +; CHECK_NOPT-NEXT: sw a1, 12(sp) # 4-byte Folded Spill ; CHECK_NOPT-NEXT: li a1, -17 -; CHECK_NOPT-NEXT: sw a2, 12(sp) # 4-byte Folded Spill ; CHECK_NOPT-NEXT: beq a0, a1, .LBB5_2 ; CHECK_NOPT-NEXT: # %bb.1: # %entry -; CHECK_NOPT-NEXT: lw a0, 8(sp) # 4-byte Folded Reload -; CHECK_NOPT-NEXT: sw a0, 12(sp) # 4-byte Folded Spill -; CHECK_NOPT-NEXT: .LBB5_2: # %entry ; CHECK_NOPT-NEXT: lw a0, 12(sp) # 4-byte Folded Reload +; 
CHECK_NOPT-NEXT: sw a0, 8(sp) # 4-byte Folded Spill +; CHECK_NOPT-NEXT: .LBB5_2: # %entry +; CHECK_NOPT-NEXT: lw a0, 8(sp) # 4-byte Folded Reload ; CHECK_NOPT-NEXT: addi sp, sp, 16 ; CHECK_NOPT-NEXT: .cfi_def_cfa_offset 0 ; CHECK_NOPT-NEXT: ret @@ -193,15 +193,15 @@ define i32 @select_no_beqimm_2(i32 %a, i32 %x, i32 %y) { ; CHECK_NOPT: # %bb.0: # %entry ; CHECK_NOPT-NEXT: addi sp, sp, -16 ; CHECK_NOPT-NEXT: .cfi_def_cfa_offset 16 -; CHECK_NOPT-NEXT: sw a1, 8(sp) # 4-byte Folded Spill +; CHECK_NOPT-NEXT: sw a2, 8(sp) # 4-byte Folded Spill +; CHECK_NOPT-NEXT: sw a1, 12(sp) # 4-byte Folded Spill ; CHECK_NOPT-NEXT: li a1, 16 -; CHECK_NOPT-NEXT: sw a2, 12(sp) # 4-byte Folded Spill ; CHECK_NOPT-NEXT: beq a0, a1, .LBB6_2 ; CHECK_NOPT-NEXT: # %bb.1: # %entry -; CHECK_NOPT-NEXT: lw a0, 8(sp) # 4-byte Folded Reload -; CHECK_NOPT-NEXT: sw a0, 12(sp) # 4-byte Folded Spill -; CHECK_NOPT-NEXT: .LBB6_2: # %entry ; CHECK_NOPT-NEXT: lw a0, 12(sp) # 4-byte Folded Reload +; CHECK_NOPT-NEXT: sw a0, 8(sp) # 4-byte Folded Spill +; CHECK_NOPT-NEXT: .LBB6_2: # %entry +; CHECK_NOPT-NEXT: lw a0, 8(sp) # 4-byte Folded Reload ; CHECK_NOPT-NEXT: addi sp, sp, 16 ; CHECK_NOPT-NEXT: .cfi_def_cfa_offset 0 ; CHECK_NOPT-NEXT: ret @@ -226,14 +226,14 @@ define i32 @select_bneimm_1(i32 %a, i32 %x, i32 %y) { ; CHECK_NOPT: # %bb.0: # %entry ; CHECK_NOPT-NEXT: addi sp, sp, -16 ; CHECK_NOPT-NEXT: .cfi_def_cfa_offset 16 -; CHECK_NOPT-NEXT: sw a1, 8(sp) # 4-byte Folded Spill -; CHECK_NOPT-NEXT: sw a2, 12(sp) # 4-byte Folded Spill +; CHECK_NOPT-NEXT: sw a2, 8(sp) # 4-byte Folded Spill +; CHECK_NOPT-NEXT: sw a1, 12(sp) # 4-byte Folded Spill ; CHECK_NOPT-NEXT: cv.bneimm a0, 0, .LBB7_2 ; CHECK_NOPT-NEXT: # %bb.1: # %entry -; CHECK_NOPT-NEXT: lw a0, 8(sp) # 4-byte Folded Reload -; CHECK_NOPT-NEXT: sw a0, 12(sp) # 4-byte Folded Spill -; CHECK_NOPT-NEXT: .LBB7_2: # %entry ; CHECK_NOPT-NEXT: lw a0, 12(sp) # 4-byte Folded Reload +; CHECK_NOPT-NEXT: sw a0, 8(sp) # 4-byte Folded Spill +; CHECK_NOPT-NEXT: .LBB7_2: # 
%entry +; CHECK_NOPT-NEXT: lw a0, 8(sp) # 4-byte Folded Reload ; CHECK_NOPT-NEXT: addi sp, sp, 16 ; CHECK_NOPT-NEXT: .cfi_def_cfa_offset 0 ; CHECK_NOPT-NEXT: ret diff --git a/llvm/test/CodeGen/SystemZ/swifterror.ll b/llvm/test/CodeGen/SystemZ/swifterror.ll index 1b18287cac146..39f0907295ff4 100644 --- a/llvm/test/CodeGen/SystemZ/swifterror.ll +++ b/llvm/test/CodeGen/SystemZ/swifterror.ll @@ -162,8 +162,8 @@ define float @foo_loop(ptr swifterror %error_ptr_ref, i32 %cc, float %cc2) { ; CHECK-O0: je ; CHECK-O0: lghi %r2, 16 ; CHECK-O0: brasl %r14, malloc -; CHECK-O0: lgr %r[[REG1:[0-9]+]], %r2 -; CHECK-O0: mvi 8(%r[[REG1]]), 1 +; CHECK-O0: lgr %r{{[0-9]+}}, %r2 +; CHECK-O0: mvi 8(%r2), 1 ; CHECK-O0: jnh ; reload from stack ; CHECK-O0: lg %r9, [[OFFS:[0-9]+]](%r15) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll index 8a5a15a57912c..08b99c67d9d55 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll @@ -328,14 +328,14 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(ptr nocapture readonly ; CHECK-LABEL: test_vec_mul_scalar_add_char: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} -; CHECK-NEXT: ldr r4, [sp, #28] -; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: ldr.w r12, [sp, #28] +; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: beq.w .LBB5_11 ; CHECK-NEXT: @ %bb.1: @ %for.body.lr.ph -; CHECK-NEXT: adds r7, r1, r4 -; CHECK-NEXT: add.w r6, r3, r4, lsl #2 +; CHECK-NEXT: add.w r7, r1, r12 +; CHECK-NEXT: add.w r6, r3, r12, lsl #2 ; CHECK-NEXT: cmp r7, r3 -; CHECK-NEXT: add.w r5, r0, r4 +; CHECK-NEXT: add.w r5, r0, r12 ; CHECK-NEXT: cset r7, hi ; CHECK-NEXT: cmp r6, r1 ; CHECK-NEXT: csel r7, zr, r7, ls @@ -348,15 +348,15 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(ptr nocapture readonly ; CHECK-NEXT: cmpeq r7, #0 ; CHECK-NEXT: beq .LBB5_4 ; 
CHECK-NEXT: @ %bb.2: @ %for.body.preheader -; CHECK-NEXT: and r12, r4, #3 -; CHECK-NEXT: subs r7, r4, #1 +; CHECK-NEXT: and r8, r12, #3 +; CHECK-NEXT: sub.w r7, r12, #1 ; CHECK-NEXT: cmp r7, #3 ; CHECK-NEXT: bhs .LBB5_6 ; CHECK-NEXT: @ %bb.3: -; CHECK-NEXT: mov.w r8, #0 +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: b .LBB5_8 ; CHECK-NEXT: .LBB5_4: @ %vector.ph -; CHECK-NEXT: dlstp.32 lr, r4 +; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB5_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrb.u32 q0, [r0], #4 @@ -366,18 +366,18 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(ptr nocapture readonly ; CHECK-NEXT: letp lr, .LBB5_5 ; CHECK-NEXT: b .LBB5_11 ; CHECK-NEXT: .LBB5_6: @ %for.body.preheader.new -; CHECK-NEXT: bic r7, r4, #3 +; CHECK-NEXT: bic r7, r12, #3 ; CHECK-NEXT: movs r6, #1 ; CHECK-NEXT: subs r7, #4 ; CHECK-NEXT: add.w r5, r3, #8 -; CHECK-NEXT: mov.w r8, #0 +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: add.w lr, r6, r7, lsr #2 ; CHECK-NEXT: adds r6, r0, #3 ; CHECK-NEXT: adds r7, r1, #1 ; CHECK-NEXT: .LBB5_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrb r9, [r6, #-3] -; CHECK-NEXT: add.w r8, r8, #4 +; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: ldrb r4, [r7, #-1] ; CHECK-NEXT: smlabb r4, r4, r9, r2 ; CHECK-NEXT: str r4, [r5, #-8] @@ -396,11 +396,11 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(ptr nocapture readonly ; CHECK-NEXT: adds r5, #16 ; CHECK-NEXT: le lr, .LBB5_7 ; CHECK-NEXT: .LBB5_8: @ %for.cond.cleanup.loopexit.unr-lcssa -; CHECK-NEXT: wls lr, r12, .LBB5_11 +; CHECK-NEXT: wls lr, r8, .LBB5_11 ; CHECK-NEXT: @ %bb.9: @ %for.body.epil.preheader -; CHECK-NEXT: add r0, r8 -; CHECK-NEXT: add r1, r8 -; CHECK-NEXT: add.w r3, r3, r8, lsl #2 +; CHECK-NEXT: add r0, r12 +; CHECK-NEXT: add r1, r12 +; CHECK-NEXT: add.w r3, r3, r12, lsl #2 ; CHECK-NEXT: .LBB5_10: @ %for.body.epil ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrb r7, [r0], #1 @@ 
-604,14 +604,14 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(ptr nocapture readonl ; CHECK-LABEL: test_vec_mul_scalar_add_uchar: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} -; CHECK-NEXT: ldr r4, [sp, #28] -; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: ldr.w r12, [sp, #28] +; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: beq.w .LBB7_11 ; CHECK-NEXT: @ %bb.1: @ %for.body.lr.ph -; CHECK-NEXT: adds r7, r1, r4 -; CHECK-NEXT: add.w r6, r3, r4, lsl #2 +; CHECK-NEXT: add.w r7, r1, r12 +; CHECK-NEXT: add.w r6, r3, r12, lsl #2 ; CHECK-NEXT: cmp r7, r3 -; CHECK-NEXT: add.w r5, r0, r4 +; CHECK-NEXT: add.w r5, r0, r12 ; CHECK-NEXT: cset r7, hi ; CHECK-NEXT: cmp r6, r1 ; CHECK-NEXT: csel r7, zr, r7, ls @@ -624,15 +624,15 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(ptr nocapture readonl ; CHECK-NEXT: cmpeq r7, #0 ; CHECK-NEXT: beq .LBB7_4 ; CHECK-NEXT: @ %bb.2: @ %for.body.preheader -; CHECK-NEXT: and r12, r4, #3 -; CHECK-NEXT: subs r7, r4, #1 +; CHECK-NEXT: and r8, r12, #3 +; CHECK-NEXT: sub.w r7, r12, #1 ; CHECK-NEXT: cmp r7, #3 ; CHECK-NEXT: bhs .LBB7_6 ; CHECK-NEXT: @ %bb.3: -; CHECK-NEXT: mov.w r8, #0 +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: b .LBB7_8 ; CHECK-NEXT: .LBB7_4: @ %vector.ph -; CHECK-NEXT: dlstp.32 lr, r4 +; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB7_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrb.u32 q0, [r0], #4 @@ -642,18 +642,18 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(ptr nocapture readonl ; CHECK-NEXT: letp lr, .LBB7_5 ; CHECK-NEXT: b .LBB7_11 ; CHECK-NEXT: .LBB7_6: @ %for.body.preheader.new -; CHECK-NEXT: bic r7, r4, #3 +; CHECK-NEXT: bic r7, r12, #3 ; CHECK-NEXT: movs r6, #1 ; CHECK-NEXT: subs r7, #4 ; CHECK-NEXT: add.w r5, r3, #8 -; CHECK-NEXT: mov.w r8, #0 +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: add.w lr, r6, r7, lsr #2 ; CHECK-NEXT: adds r6, r0, #3 ; CHECK-NEXT: adds r7, r1, #1 ; CHECK-NEXT: .LBB7_7: @ %for.body ; CHECK-NEXT: 
@ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrb r9, [r6, #-3] -; CHECK-NEXT: add.w r8, r8, #4 +; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: ldrb r4, [r7, #-1] ; CHECK-NEXT: smlabb r4, r4, r9, r2 ; CHECK-NEXT: str r4, [r5, #-8] @@ -672,11 +672,11 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(ptr nocapture readonl ; CHECK-NEXT: adds r5, #16 ; CHECK-NEXT: le lr, .LBB7_7 ; CHECK-NEXT: .LBB7_8: @ %for.cond.cleanup.loopexit.unr-lcssa -; CHECK-NEXT: wls lr, r12, .LBB7_11 +; CHECK-NEXT: wls lr, r8, .LBB7_11 ; CHECK-NEXT: @ %bb.9: @ %for.body.epil.preheader -; CHECK-NEXT: add r0, r8 -; CHECK-NEXT: add r1, r8 -; CHECK-NEXT: add.w r3, r3, r8, lsl #2 +; CHECK-NEXT: add r0, r12 +; CHECK-NEXT: add r1, r12 +; CHECK-NEXT: add.w r3, r3, r12, lsl #2 ; CHECK-NEXT: .LBB7_10: @ %for.body.epil ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrb r7, [r0], #1 @@ -880,14 +880,14 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_int(ptr nocapture readonly ; CHECK-LABEL: test_vec_mul_scalar_add_int: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} -; CHECK-NEXT: ldr r4, [sp, #28] -; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: ldr.w r12, [sp, #28] +; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: beq.w .LBB9_11 ; CHECK-NEXT: @ %bb.1: @ %vector.memcheck -; CHECK-NEXT: add.w r7, r1, r4, lsl #2 -; CHECK-NEXT: add.w r6, r3, r4, lsl #2 +; CHECK-NEXT: add.w r7, r1, r12, lsl #2 +; CHECK-NEXT: add.w r6, r3, r12, lsl #2 ; CHECK-NEXT: cmp r7, r3 -; CHECK-NEXT: add.w r5, r0, r4, lsl #2 +; CHECK-NEXT: add.w r5, r0, r12, lsl #2 ; CHECK-NEXT: cset r7, hi ; CHECK-NEXT: cmp r6, r1 ; CHECK-NEXT: csel r7, zr, r7, ls @@ -900,15 +900,15 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_int(ptr nocapture readonly ; CHECK-NEXT: cmpeq r7, #0 ; CHECK-NEXT: beq .LBB9_4 ; CHECK-NEXT: @ %bb.2: @ %for.body.preheader -; CHECK-NEXT: and r12, r4, #3 -; CHECK-NEXT: subs r7, r4, #1 +; CHECK-NEXT: and r8, r12, #3 +; CHECK-NEXT: sub.w r7, r12, #1 ; 
CHECK-NEXT: cmp r7, #3 ; CHECK-NEXT: bhs .LBB9_6 ; CHECK-NEXT: @ %bb.3: -; CHECK-NEXT: mov.w r8, #0 +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: b .LBB9_8 ; CHECK-NEXT: .LBB9_4: @ %vector.ph -; CHECK-NEXT: dlstp.32 lr, r4 +; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB9_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 @@ -918,18 +918,18 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_int(ptr nocapture readonly ; CHECK-NEXT: letp lr, .LBB9_5 ; CHECK-NEXT: b .LBB9_11 ; CHECK-NEXT: .LBB9_6: @ %for.body.preheader.new -; CHECK-NEXT: bic r7, r4, #3 +; CHECK-NEXT: bic r7, r12, #3 ; CHECK-NEXT: movs r6, #1 ; CHECK-NEXT: subs r7, #4 ; CHECK-NEXT: add.w r5, r3, #8 -; CHECK-NEXT: mov.w r8, #0 +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: add.w lr, r6, r7, lsr #2 ; CHECK-NEXT: add.w r6, r0, #8 ; CHECK-NEXT: add.w r7, r1, #8 ; CHECK-NEXT: .LBB9_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r9, [r6, #-8] -; CHECK-NEXT: add.w r8, r8, #4 +; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: ldr r4, [r7, #-8] ; CHECK-NEXT: mla r4, r4, r9, r2 ; CHECK-NEXT: str r4, [r5, #-8] @@ -950,11 +950,11 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_int(ptr nocapture readonly ; CHECK-NEXT: adds r5, #16 ; CHECK-NEXT: le lr, .LBB9_7 ; CHECK-NEXT: .LBB9_8: @ %for.cond.cleanup.loopexit.unr-lcssa -; CHECK-NEXT: wls lr, r12, .LBB9_11 +; CHECK-NEXT: wls lr, r8, .LBB9_11 ; CHECK-NEXT: @ %bb.9: @ %for.body.epil.preheader -; CHECK-NEXT: add.w r0, r0, r8, lsl #2 -; CHECK-NEXT: add.w r1, r1, r8, lsl #2 -; CHECK-NEXT: add.w r3, r3, r8, lsl #2 +; CHECK-NEXT: add.w r0, r0, r12, lsl #2 +; CHECK-NEXT: add.w r1, r1, r12, lsl #2 +; CHECK-NEXT: add.w r3, r3, r12, lsl #2 ; CHECK-NEXT: .LBB9_10: @ %for.body.epil ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r7, [r0], #4 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/sibling-loops.ll 
b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/sibling-loops.ll index c6158cb611a70..4527576322606 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/sibling-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/sibling-loops.ll @@ -10,10 +10,10 @@ define arm_aapcs_vfpcc void @test(ptr noalias nocapture readonly %off, ptr noali ; CHECK-NEXT: .LBB0_1: @ %for.cond1.preheader.us.preheader ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} ; CHECK-NEXT: sub.w r9, r1, #2 -; CHECK-NEXT: sub.w r8, r0, #2 +; CHECK-NEXT: sub.w r12, r0, #2 ; CHECK-NEXT: subs r5, r2, #2 ; CHECK-NEXT: mov r10, r3 -; CHECK-NEXT: lsl.w r12, r3, #1 +; CHECK-NEXT: lsl.w r8, r3, #1 ; CHECK-NEXT: movs r0, #0 ; CHECK-NEXT: mov r4, r9 ; CHECK-NEXT: .LBB0_2: @ %for.cond1.preheader.us @@ -21,7 +21,7 @@ define arm_aapcs_vfpcc void @test(ptr noalias nocapture readonly %off, ptr noali ; CHECK-NEXT: @ Child Loop BB0_3 Depth 2 ; CHECK-NEXT: @ Child Loop BB0_5 Depth 2 ; CHECK-NEXT: dls lr, r10 -; CHECK-NEXT: mov r6, r8 +; CHECK-NEXT: mov r6, r12 ; CHECK-NEXT: mov r7, r9 ; CHECK-NEXT: mov r2, r4 ; CHECK-NEXT: .LBB0_3: @ %for.body4.us @@ -35,7 +35,7 @@ define arm_aapcs_vfpcc void @test(ptr noalias nocapture readonly %off, ptr noali ; CHECK-NEXT: @ %bb.4: @ %for.body15.us.preheader ; CHECK-NEXT: @ in Loop: Header=BB0_2 Depth=1 ; CHECK-NEXT: dls lr, r10 -; CHECK-NEXT: mov r6, r8 +; CHECK-NEXT: mov r6, r12 ; CHECK-NEXT: mov r7, r9 ; CHECK-NEXT: mov r2, r5 ; CHECK-NEXT: .LBB0_5: @ %for.body15.us @@ -49,8 +49,8 @@ define arm_aapcs_vfpcc void @test(ptr noalias nocapture readonly %off, ptr noali ; CHECK-NEXT: @ %bb.6: @ %for.cond.cleanup14.us ; CHECK-NEXT: @ in Loop: Header=BB0_2 Depth=1 ; CHECK-NEXT: adds r0, #1 -; CHECK-NEXT: add r5, r12 -; CHECK-NEXT: add r4, r12 +; CHECK-NEXT: add r5, r8 +; CHECK-NEXT: add r4, r8 ; CHECK-NEXT: cmp r0, r10 ; CHECK-NEXT: bne .LBB0_2 ; CHECK-NEXT: @ %bb.7: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll 
b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll index a0e690212d5a4..7acc83343dcb8 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll @@ -17,18 +17,18 @@ define dso_local void @check_option(ptr noalias nocapture %A, ptr noalias nocapt ; ENABLED-NEXT: .LBB0_2: @ %vector.ph ; ENABLED-NEXT: @ =>This Loop Header: Depth=1 ; ENABLED-NEXT: @ Child Loop BB0_3 Depth 2 -; ENABLED-NEXT: mov r12, r0 -; ENABLED-NEXT: mov r4, r2 -; ENABLED-NEXT: mov r5, r1 -; ENABLED-NEXT: mov r6, r3 -; ENABLED-NEXT: dlstp.32 lr, r6 +; ENABLED-NEXT: mov r4, r0 +; ENABLED-NEXT: mov r5, r2 +; ENABLED-NEXT: mov r6, r1 +; ENABLED-NEXT: mov r7, r3 +; ENABLED-NEXT: dlstp.32 lr, r7 ; ENABLED-NEXT: .LBB0_3: @ %vector.body ; ENABLED-NEXT: @ Parent Loop BB0_2 Depth=1 ; ENABLED-NEXT: @ => This Inner Loop Header: Depth=2 -; ENABLED-NEXT: vldrw.u32 q0, [r5], #16 -; ENABLED-NEXT: vldrw.u32 q1, [r4], #16 +; ENABLED-NEXT: vldrw.u32 q0, [r6], #16 +; ENABLED-NEXT: vldrw.u32 q1, [r5], #16 ; ENABLED-NEXT: vadd.i32 q0, q1, q0 -; ENABLED-NEXT: vstrw.32 q0, [r12], #16 +; ENABLED-NEXT: vstrw.32 q0, [r4], #16 ; ENABLED-NEXT: letp lr, .LBB0_3 ; ENABLED-NEXT: b .LBB0_2 ; ENABLED-NEXT: .LBB0_4: @ %for.cond.cleanup @@ -44,29 +44,29 @@ define dso_local void @check_option(ptr noalias nocapture %A, ptr noalias nocapt ; DISABLED-NEXT: movs r6, #1 ; DISABLED-NEXT: bic r7, r7, #3 ; DISABLED-NEXT: subs r7, #4 -; DISABLED-NEXT: add.w r8, r6, r7, lsr #2 +; DISABLED-NEXT: add.w r12, r6, r7, lsr #2 ; DISABLED-NEXT: .LBB0_2: @ %vector.ph ; DISABLED-NEXT: @ =>This Loop Header: Depth=1 ; DISABLED-NEXT: @ Child Loop BB0_3 Depth 2 -; DISABLED-NEXT: mov r7, r8 -; DISABLED-NEXT: mov r12, r0 -; DISABLED-NEXT: mov r4, r2 -; DISABLED-NEXT: mov r5, r1 -; DISABLED-NEXT: mov r6, r3 -; DISABLED-NEXT: dls lr, r8 +; DISABLED-NEXT: mov r8, r12 +; DISABLED-NEXT: mov r4, r0 +; DISABLED-NEXT: mov r5, r2 +; 
DISABLED-NEXT: mov r6, r1 +; DISABLED-NEXT: mov r7, r3 +; DISABLED-NEXT: dls lr, r12 ; DISABLED-NEXT: .LBB0_3: @ %vector.body ; DISABLED-NEXT: @ Parent Loop BB0_2 Depth=1 ; DISABLED-NEXT: @ => This Inner Loop Header: Depth=2 -; DISABLED-NEXT: vctp.32 r6 -; DISABLED-NEXT: mov lr, r7 +; DISABLED-NEXT: vctp.32 r7 +; DISABLED-NEXT: mov lr, r8 ; DISABLED-NEXT: vpstt -; DISABLED-NEXT: vldrwt.u32 q0, [r5], #16 -; DISABLED-NEXT: vldrwt.u32 q1, [r4], #16 -; DISABLED-NEXT: subs r7, #1 -; DISABLED-NEXT: subs r6, #4 +; DISABLED-NEXT: vldrwt.u32 q0, [r6], #16 +; DISABLED-NEXT: vldrwt.u32 q1, [r5], #16 +; DISABLED-NEXT: sub.w r8, r8, #1 +; DISABLED-NEXT: subs r7, #4 ; DISABLED-NEXT: vadd.i32 q0, q1, q0 ; DISABLED-NEXT: vpst -; DISABLED-NEXT: vstrwt.32 q0, [r12], #16 +; DISABLED-NEXT: vstrwt.32 q0, [r4], #16 ; DISABLED-NEXT: le lr, .LBB0_3 ; DISABLED-NEXT: b .LBB0_2 ; DISABLED-NEXT: .LBB0_4: @ %for.cond.cleanup diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll index 07c06e10979cd..736d5956b6194 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll @@ -29,7 +29,7 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; ENABLED-NEXT: mov.w r8, #0 ; ENABLED-NEXT: mov r9, r12 ; ENABLED-NEXT: uxth r0, r0 -; ENABLED-NEXT: rsbs r5, r0, #0 +; ENABLED-NEXT: rsbs r6, r0, #0 ; ENABLED-NEXT: b .LBB0_4 ; ENABLED-NEXT: .LBB0_2: @ in Loop: Header=BB0_4 Depth=1 ; ENABLED-NEXT: movs r0, #0 @@ -52,9 +52,9 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; ENABLED-NEXT: bic r0, r9, #3 ; ENABLED-NEXT: movs r7, #1 ; ENABLED-NEXT: subs r0, #4 -; ENABLED-NEXT: sub.w r4, r2, r8 +; ENABLED-NEXT: sub.w r5, r2, r8 ; ENABLED-NEXT: vmov.i32 q1, #0x0 -; ENABLED-NEXT: add.w r6, r7, r0, lsr #2 +; ENABLED-NEXT: add.w r4, r7, r0, lsr #2 
; ENABLED-NEXT: sub.w r0, r12, r8 ; ENABLED-NEXT: bic r0, r0, #3 ; ENABLED-NEXT: subs r0, #4 @@ -65,16 +65,16 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; ENABLED-NEXT: .LBB0_6: @ %vector.body ; ENABLED-NEXT: @ Parent Loop BB0_4 Depth=1 ; ENABLED-NEXT: @ => This Inner Loop Header: Depth=2 -; ENABLED-NEXT: vctp.32 r4 +; ENABLED-NEXT: vctp.32 r5 ; ENABLED-NEXT: vmov q0, q1 ; ENABLED-NEXT: vpstt ; ENABLED-NEXT: vldrht.s32 q1, [r0], #8 ; ENABLED-NEXT: vldrht.s32 q2, [r7], #8 -; ENABLED-NEXT: mov lr, r6 -; ENABLED-NEXT: subs r6, #1 +; ENABLED-NEXT: mov lr, r4 +; ENABLED-NEXT: subs r4, #1 ; ENABLED-NEXT: vmul.i32 q1, q2, q1 -; ENABLED-NEXT: subs r4, #4 -; ENABLED-NEXT: vshl.s32 q1, r5 +; ENABLED-NEXT: subs r5, #4 +; ENABLED-NEXT: vshl.s32 q1, r6 ; ENABLED-NEXT: vadd.i32 q1, q1, q0 ; ENABLED-NEXT: le lr, .LBB0_6 ; ENABLED-NEXT: @ %bb.7: @ %middle.block @@ -100,7 +100,7 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; NOREDUCTIONS-NEXT: mov.w r8, #0 ; NOREDUCTIONS-NEXT: mov r9, r12 ; NOREDUCTIONS-NEXT: uxth r0, r0 -; NOREDUCTIONS-NEXT: rsbs r5, r0, #0 +; NOREDUCTIONS-NEXT: rsbs r6, r0, #0 ; NOREDUCTIONS-NEXT: b .LBB0_4 ; NOREDUCTIONS-NEXT: .LBB0_2: @ in Loop: Header=BB0_4 Depth=1 ; NOREDUCTIONS-NEXT: movs r0, #0 @@ -123,9 +123,9 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; NOREDUCTIONS-NEXT: bic r0, r9, #3 ; NOREDUCTIONS-NEXT: movs r7, #1 ; NOREDUCTIONS-NEXT: subs r0, #4 -; NOREDUCTIONS-NEXT: sub.w r4, r2, r8 +; NOREDUCTIONS-NEXT: sub.w r5, r2, r8 ; NOREDUCTIONS-NEXT: vmov.i32 q1, #0x0 -; NOREDUCTIONS-NEXT: add.w r6, r7, r0, lsr #2 +; NOREDUCTIONS-NEXT: add.w r4, r7, r0, lsr #2 ; NOREDUCTIONS-NEXT: sub.w r0, r12, r8 ; NOREDUCTIONS-NEXT: bic r0, r0, #3 ; NOREDUCTIONS-NEXT: subs r0, #4 @@ -136,16 +136,16 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; NOREDUCTIONS-NEXT: .LBB0_6: @ %vector.body ; NOREDUCTIONS-NEXT: @ 
Parent Loop BB0_4 Depth=1 ; NOREDUCTIONS-NEXT: @ => This Inner Loop Header: Depth=2 -; NOREDUCTIONS-NEXT: vctp.32 r4 +; NOREDUCTIONS-NEXT: vctp.32 r5 ; NOREDUCTIONS-NEXT: vmov q0, q1 ; NOREDUCTIONS-NEXT: vpstt ; NOREDUCTIONS-NEXT: vldrht.s32 q1, [r0], #8 ; NOREDUCTIONS-NEXT: vldrht.s32 q2, [r7], #8 -; NOREDUCTIONS-NEXT: mov lr, r6 -; NOREDUCTIONS-NEXT: subs r6, #1 +; NOREDUCTIONS-NEXT: mov lr, r4 +; NOREDUCTIONS-NEXT: subs r4, #1 ; NOREDUCTIONS-NEXT: vmul.i32 q1, q2, q1 -; NOREDUCTIONS-NEXT: subs r4, #4 -; NOREDUCTIONS-NEXT: vshl.s32 q1, r5 +; NOREDUCTIONS-NEXT: subs r5, #4 +; NOREDUCTIONS-NEXT: vshl.s32 q1, r6 ; NOREDUCTIONS-NEXT: vadd.i32 q1, q1, q0 ; NOREDUCTIONS-NEXT: le lr, .LBB0_6 ; NOREDUCTIONS-NEXT: @ %bb.7: @ %middle.block diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll index cbcbf1f392ce8..a6a9361050731 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll @@ -165,74 +165,69 @@ define dso_local i32 @b(ptr %c, i32 %d, i32 %e, ptr %n) "frame-pointer"="all" { ; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: wls lr, r1, .LBB2_3 ; CHECK-NEXT: @ %bb.1: @ %while.body.preheader -; CHECK-NEXT: adds r6, r3, #4 -; CHECK-NEXT: adds r1, r0, #4 -; CHECK-NEXT: mvn r8, #1 -; CHECK-NEXT: @ implicit-def: $r9 -; CHECK-NEXT: @ implicit-def: $r4 +; CHECK-NEXT: add.w r12, r3, #4 +; CHECK-NEXT: add.w r9, r0, #4 +; CHECK-NEXT: mvn r10, #1 +; CHECK-NEXT: @ implicit-def: $r6 +; CHECK-NEXT: @ implicit-def: $r8 ; CHECK-NEXT: str r2, [sp] @ 4-byte Spill ; CHECK-NEXT: .LBB2_2: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: asrs r2, r4, #31 -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: ldr r1, [r1] +; CHECK-NEXT: ldr.w r1, [r9] +; CHECK-NEXT: asr.w r2, r8, #31 +; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: muls r1, r3, r1 -; 
CHECK-NEXT: adds r4, r4, r1 +; CHECK-NEXT: adds.w r5, r8, r1 ; CHECK-NEXT: adc.w r1, r2, r1, asr #31 -; CHECK-NEXT: adds.w r2, r4, #-2147483648 -; CHECK-NEXT: ldrd r2, r4, [r8] +; CHECK-NEXT: adds.w r2, r5, #-2147483648 +; CHECK-NEXT: ldrd r2, r5, [r10] +; CHECK-NEXT: adc r8, r1, #0 +; CHECK-NEXT: asr.w r1, r8, #31 +; CHECK-NEXT: strd r6, r2, [sp, #4] @ 8-byte Folded Spill +; CHECK-NEXT: smull r5, r6, r5, r6 +; CHECK-NEXT: subs.w r5, r8, r5 +; CHECK-NEXT: sbcs r1, r6 +; CHECK-NEXT: adds.w r6, r5, #-2147483648 ; CHECK-NEXT: adc r5, r1, #0 -; CHECK-NEXT: str r2, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: smull r4, r2, r4, r9 -; CHECK-NEXT: asrs r1, r5, #31 -; CHECK-NEXT: str r5, [sp, #8] @ 4-byte Spill -; CHECK-NEXT: subs r4, r5, r4 -; CHECK-NEXT: sbcs r1, r2 -; CHECK-NEXT: ldr r2, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: adds.w r10, r4, #-2147483648 -; CHECK-NEXT: adc r1, r1, #0 -; CHECK-NEXT: ldr r4, [r2, #-4] -; CHECK-NEXT: muls r4, r3, r4 -; CHECK-NEXT: adds r3, #4 -; CHECK-NEXT: adds.w r12, r4, #-2147483648 -; CHECK-NEXT: asr.w r5, r4, #31 -; CHECK-NEXT: ldr r4, [r6] -; CHECK-NEXT: adc r5, r5, #0 -; CHECK-NEXT: mul r2, r4, r0 +; CHECK-NEXT: ldr r1, [r9, #-4] +; CHECK-NEXT: add.w r9, r9, #4 +; CHECK-NEXT: muls r1, r3, r1 +; CHECK-NEXT: adds.w r2, r1, #-2147483648 +; CHECK-NEXT: asr.w r4, r1, #31 +; CHECK-NEXT: ldr.w r1, [r12] +; CHECK-NEXT: adc r3, r4, #0 +; CHECK-NEXT: mul r4, r1, r0 ; CHECK-NEXT: adds r0, #4 -; CHECK-NEXT: add.w r2, r2, #-2147483648 -; CHECK-NEXT: asrl r12, r5, r2 -; CHECK-NEXT: smull r2, r5, r4, r12 -; CHECK-NEXT: lsll r2, r5, #30 -; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: asr.w r11, r5, #31 -; CHECK-NEXT: mov r12, r5 -; CHECK-NEXT: lsll r12, r11, r4 -; CHECK-NEXT: mul r2, r2, r9 -; CHECK-NEXT: lsrl r12, r11, #2 -; CHECK-NEXT: adds r2, #2 -; CHECK-NEXT: lsll r12, r11, r2 +; CHECK-NEXT: add.w r4, r4, #-2147483648 +; CHECK-NEXT: asrl r2, r3, r4 +; CHECK-NEXT: smull r2, r3, r1, r2 +; CHECK-NEXT: lsll r2, r3, #30 +; CHECK-NEXT: 
asr.w r11, r3, #31 +; CHECK-NEXT: mov r4, r3 +; CHECK-NEXT: ldrd r3, r2, [sp, #4] @ 8-byte Folded Reload +; CHECK-NEXT: lsll r4, r11, r1 +; CHECK-NEXT: lsrl r4, r11, #2 +; CHECK-NEXT: muls r3, r2, r3 +; CHECK-NEXT: adds r3, #2 +; CHECK-NEXT: lsll r4, r11, r3 +; CHECK-NEXT: ldr r3, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: add.w r2, r4, #-2147483648 +; CHECK-NEXT: asrl r6, r5, r2 +; CHECK-NEXT: movs r2, #2 +; CHECK-NEXT: lsrl r6, r5, #2 +; CHECK-NEXT: adds r3, #4 +; CHECK-NEXT: str r6, [r2] +; CHECK-NEXT: ldr r2, [r10], #-4 +; CHECK-NEXT: mls r4, r2, r1, r8 ; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload -; CHECK-NEXT: add.w r5, r12, #-2147483648 -; CHECK-NEXT: asrl r10, r1, r5 -; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: lsrl r10, r1, #2 -; CHECK-NEXT: movs r1, #2 -; CHECK-NEXT: mov r9, r10 -; CHECK-NEXT: str.w r10, [r1] -; CHECK-NEXT: ldr r1, [r8], #-4 -; CHECK-NEXT: mls r5, r1, r4, r5 -; CHECK-NEXT: adds.w r4, r5, #-2147483648 -; CHECK-NEXT: asr.w r1, r5, #31 +; CHECK-NEXT: adds.w r8, r4, #-2147483648 +; CHECK-NEXT: asr.w r1, r4, #31 ; CHECK-NEXT: adc r1, r1, #0 -; CHECK-NEXT: lsrl r4, r1, #2 -; CHECK-NEXT: rsbs r1, r4, #0 +; CHECK-NEXT: lsrl r8, r1, #2 +; CHECK-NEXT: rsb.w r1, r8, #0 ; CHECK-NEXT: str r1, [r2] -; CHECK-NEXT: str r1, [r6, #-4] -; CHECK-NEXT: adds r6, #4 -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: adds r1, #4 +; CHECK-NEXT: str r1, [r12, #-4] +; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: le lr, .LBB2_2 ; CHECK-NEXT: .LBB2_3: @ %while.end ; CHECK-NEXT: add sp, #16 diff --git a/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll b/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll index d076cb00ad7e0..edbbbf25aab0a 100644 --- a/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll +++ b/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll @@ -355,8 +355,8 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, 
d14, d15} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: .pad #8 +; CHECK-NEXT: sub sp, #8 ; CHECK-NEXT: mov lr, r0 ; CHECK-NEXT: subs r0, #1 ; CHECK-NEXT: sbcs r0, r1, #0 @@ -375,7 +375,7 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: movw r2, #43691 ; CHECK-NEXT: adds r1, #2 ; CHECK-NEXT: movt r2, #43690 -; CHECK-NEXT: ldr r6, [sp, #128] +; CHECK-NEXT: ldr r6, [sp, #120] ; CHECK-NEXT: movw r8, :lower16:c ; CHECK-NEXT: umull r1, r2, r1, r2 ; CHECK-NEXT: movt r8, :upper16:c @@ -384,7 +384,6 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: @ implicit-def: $r5 ; CHECK-NEXT: @ implicit-def: $r11 ; CHECK-NEXT: mov.w r9, #12 -; CHECK-NEXT: str r4, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: add.w r1, r1, r2, lsr #1 ; CHECK-NEXT: add.w r0, r0, r2, lsr #1 ; CHECK-NEXT: bic r3, r1, #3 @@ -395,7 +394,7 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: vdup.32 q6, r0 ; CHECK-NEXT: vadd.i32 q4, q0, r7 ; CHECK-NEXT: vdup.32 q7, r0 -; CHECK-NEXT: strd r3, r7, [sp, #4] @ 8-byte Folded Spill +; CHECK-NEXT: strd r3, r7, [sp] @ 8-byte Folded Spill ; CHECK-NEXT: b .LBB1_6 ; CHECK-NEXT: .LBB1_2: @ %for.body6.preheader ; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1 @@ -444,21 +443,19 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: bhi .LBB1_17 ; CHECK-NEXT: @ %bb.8: @ %for.body6.us.preheader ; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1 -; CHECK-NEXT: ldrd r2, r3, [sp, #120] +; CHECK-NEXT: ldrd r2, r3, [sp, #112] ; CHECK-NEXT: movs r0, #32 ; CHECK-NEXT: movs r1, #0 -; CHECK-NEXT: mov r4, r6 ; CHECK-NEXT: mov r7, r12 ; CHECK-NEXT: mov r6, lr ; CHECK-NEXT: bl __aeabi_ldivmod ; CHECK-NEXT: mov lr, r6 -; CHECK-NEXT: mov r6, r4 ; CHECK-NEXT: mov r12, r7 -; CHECK-NEXT: ldr r3, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: ldr r4, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r3, [sp] @ 4-byte Reload ; CHECK-NEXT: vdup.32 q0, r2 -; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldr r6, [sp, #120] ; 
CHECK-NEXT: mov r0, r11 +; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: b .LBB1_10 ; CHECK-NEXT: .LBB1_9: @ %for.cond.cleanup17.us ; CHECK-NEXT: @ in Loop: Header=BB1_10 Depth=2 @@ -573,7 +570,7 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: b .LBB1_27 ; CHECK-NEXT: .LBB1_28: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: add sp, #8 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll index 28166e455aba2..4c0ded4515b65 100644 --- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll @@ -999,7 +999,7 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no ; CHECK-NEXT: sub sp, #32 ; CHECK-NEXT: ldrh r6, [r0] ; CHECK-NEXT: movs r5, #1 -; CHECK-NEXT: ldrd r4, r10, [r0, #4] +; CHECK-NEXT: ldrd r4, r9, [r0, #4] ; CHECK-NEXT: sub.w r0, r6, #8 ; CHECK-NEXT: add.w r3, r0, r0, lsr #29 ; CHECK-NEXT: and r0, r0, #7 @@ -1008,10 +1008,11 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no ; CHECK-NEXT: it gt ; CHECK-NEXT: asrgt r5, r3, #3 ; CHECK-NEXT: add.w r3, r4, r6, lsl #2 -; CHECK-NEXT: sub.w r9, r3, #4 +; CHECK-NEXT: subs r3, #4 +; CHECK-NEXT: str r3, [sp, #28] @ 4-byte Spill ; CHECK-NEXT: rsbs r3, r6, #0 ; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: add.w r3, r10, #32 +; CHECK-NEXT: add.w r3, r9, #32 ; CHECK-NEXT: str r5, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: str r6, [sp, #16] @ 4-byte Spill ; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill @@ -1024,8 +1025,7 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no ; CHECK-NEXT: b .LBB16_5 ; CHECK-NEXT: .LBB16_4: @ %for.end ; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1 -; 
CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload -; CHECK-NEXT: ldrd r0, r9, [sp, #20] @ 8-byte Folded Reload +; CHECK-NEXT: ldrd r0, r1, [sp, #20] @ 8-byte Folded Reload ; CHECK-NEXT: wls lr, r0, .LBB16_5 ; CHECK-NEXT: b .LBB16_10 ; CHECK-NEXT: .LBB16_5: @ %while.end @@ -1040,15 +1040,17 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB16_8 Depth 2 ; CHECK-NEXT: @ Child Loop BB16_11 Depth 2 -; CHECK-NEXT: add.w lr, r10, #8 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 -; CHECK-NEXT: ldrd r3, r7, [r10] -; CHECK-NEXT: ldm.w lr, {r0, r5, r6, lr} -; CHECK-NEXT: ldrd r11, r8, [r10, #24] -; CHECK-NEXT: vstrb.8 q0, [r9], #16 +; CHECK-NEXT: add.w lr, r9, #8 +; CHECK-NEXT: ldrd r3, r7, [r9] +; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill +; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: ldm.w lr, {r0, r5, lr} +; CHECK-NEXT: ldrd r10, r11, [r9, #20] +; CHECK-NEXT: ldr.w r8, [r9, #28] +; CHECK-NEXT: vstrb.8 q0, [r1], #16 ; CHECK-NEXT: vldrw.u32 q0, [r4], #32 ; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill -; CHECK-NEXT: str.w r9, [sp, #24] @ 4-byte Spill ; CHECK-NEXT: vldrw.u32 q1, [r4, #-28] ; CHECK-NEXT: vmul.f32 q0, q0, r3 ; CHECK-NEXT: vldrw.u32 q6, [r4, #-24] @@ -1059,9 +1061,9 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no ; CHECK-NEXT: vldrw.u32 q2, [r4, #-12] ; CHECK-NEXT: vfma.f32 q0, q4, r5 ; CHECK-NEXT: vldrw.u32 q3, [r4, #-8] -; CHECK-NEXT: vfma.f32 q0, q5, r6 +; CHECK-NEXT: vfma.f32 q0, q5, lr ; CHECK-NEXT: vldrw.u32 q1, [r4, #-4] -; CHECK-NEXT: vfma.f32 q0, q2, lr +; CHECK-NEXT: vfma.f32 q0, q2, r10 ; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: vfma.f32 q0, q3, r11 ; CHECK-NEXT: vfma.f32 q0, q1, r8 @@ -1075,25 +1077,26 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no ; CHECK-NEXT: .LBB16_8: @ %for.body ; CHECK-NEXT: @ Parent Loop BB16_6 Depth=1 ; CHECK-NEXT: @ => 
This Inner Loop Header: Depth=2 -; CHECK-NEXT: ldm.w r7, {r0, r3, r5, r6, r8, r11} +; CHECK-NEXT: ldm.w r7, {r0, r3, r5, r6} ; CHECK-NEXT: vldrw.u32 q1, [r4], #32 +; CHECK-NEXT: add.w r11, r7, #16 ; CHECK-NEXT: vldrw.u32 q6, [r4, #-24] ; CHECK-NEXT: vldrw.u32 q4, [r4, #-20] ; CHECK-NEXT: vfma.f32 q0, q1, r0 ; CHECK-NEXT: vldrw.u32 q1, [r4, #-28] +; CHECK-NEXT: ldm.w r11, {r1, r8, r10, r11} ; CHECK-NEXT: vldrw.u32 q5, [r4, #-16] -; CHECK-NEXT: vldrw.u32 q2, [r4, #-12] ; CHECK-NEXT: vfma.f32 q0, q1, r3 -; CHECK-NEXT: ldrd r9, r1, [r7, #24] +; CHECK-NEXT: vldrw.u32 q2, [r4, #-12] ; CHECK-NEXT: vfma.f32 q0, q6, r5 ; CHECK-NEXT: vldrw.u32 q3, [r4, #-8] ; CHECK-NEXT: vfma.f32 q0, q4, r6 ; CHECK-NEXT: vldrw.u32 q1, [r4, #-4] -; CHECK-NEXT: vfma.f32 q0, q5, r8 +; CHECK-NEXT: vfma.f32 q0, q5, r1 ; CHECK-NEXT: adds r7, #32 -; CHECK-NEXT: vfma.f32 q0, q2, r11 -; CHECK-NEXT: vfma.f32 q0, q3, r9 -; CHECK-NEXT: vfma.f32 q0, q1, r1 +; CHECK-NEXT: vfma.f32 q0, q2, r8 +; CHECK-NEXT: vfma.f32 q0, q3, r10 +; CHECK-NEXT: vfma.f32 q0, q1, r11 ; CHECK-NEXT: le lr, .LBB16_8 ; CHECK-NEXT: b .LBB16_4 ; CHECK-NEXT: .LBB16_9: @ in Loop: Header=BB16_6 Depth=1 diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll index 652d25af02e7c..8fe310bd3d5e3 100644 --- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll +++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll @@ -180,15 +180,15 @@ define void @correlate(ptr nocapture noundef readonly %ID, ptr nocapture noundef ; CHECK-NEXT: @ in Loop: Header=BB4_4 Depth=1 ; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-NEXT: add.w r2, r9, r10 -; CHECK-NEXT: add.w r7, r1, r9, lsl #1 +; CHECK-NEXT: add.w r5, r1, r9, lsl #1 ; CHECK-NEXT: add.w r2, r1, r2, lsl #1 -; CHECK-NEXT: sub.w r5, r8, r9 -; CHECK-NEXT: dlstp.32 lr, r5 +; CHECK-NEXT: sub.w r7, r8, r9 +; CHECK-NEXT: dlstp.32 lr, r7 ; CHECK-NEXT: .LBB4_11: @ %vec.epilog.vector.body ; CHECK-NEXT: @ Parent Loop BB4_4 
Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: rsbs r4, r3, #0 -; CHECK-NEXT: vldrh.s32 q0, [r7], #8 +; CHECK-NEXT: vldrh.s32 q0, [r5], #8 ; CHECK-NEXT: vldrh.s32 q1, [r2], #8 ; CHECK-NEXT: vmul.i32 q0, q1, q0 ; CHECK-NEXT: vshl.s32 q0, r4 diff --git a/llvm/test/CodeGen/Thumb2/mve-memtp-loop.ll b/llvm/test/CodeGen/Thumb2/mve-memtp-loop.ll index da59cb259db61..22deb23cad27e 100644 --- a/llvm/test/CodeGen/Thumb2/mve-memtp-loop.ll +++ b/llvm/test/CodeGen/Thumb2/mve-memtp-loop.ll @@ -548,44 +548,43 @@ define i32 @reverted(i1 zeroext %b) { ; CHECK-NEXT: letp lr, .LBB19_1 ; CHECK-NEXT: .LBB19_2: @ %entry ; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: movw r6, :lower16:arr_20 -; CHECK-NEXT: movt r6, :upper16:arr_20 -; CHECK-NEXT: add.w r3, r6, #80 +; CHECK-NEXT: movw r11, :lower16:arr_20 +; CHECK-NEXT: adr r6, .LCPI19_0 +; CHECK-NEXT: movt r11, :upper16:arr_20 ; CHECK-NEXT: dls lr, r0 ; CHECK-NEXT: movw r0, :lower16:arr_21 ; CHECK-NEXT: movt r0, :upper16:arr_21 ; CHECK-NEXT: add.w r5, r0, #36 -; CHECK-NEXT: add.w r11, r6, #128 -; CHECK-NEXT: add.w r7, r6, #112 -; CHECK-NEXT: add.w r2, r6, #96 -; CHECK-NEXT: add.w r4, r6, #64 -; CHECK-NEXT: add.w r0, r6, #48 -; CHECK-NEXT: add.w r1, r6, #32 -; CHECK-NEXT: add.w r12, r6, #16 -; CHECK-NEXT: adr r6, .LCPI19_0 -; CHECK-NEXT: vldrw.u32 q0, [r6] -; CHECK-NEXT: movw r6, :lower16:arr_20 +; CHECK-NEXT: add.w r3, r11, #80 +; CHECK-NEXT: add.w r9, r11, #128 +; CHECK-NEXT: add.w r7, r11, #112 +; CHECK-NEXT: add.w r2, r11, #96 +; CHECK-NEXT: add.w r4, r11, #64 +; CHECK-NEXT: add.w r0, r11, #48 +; CHECK-NEXT: add.w r1, r11, #32 +; CHECK-NEXT: add.w r12, r11, #16 ; CHECK-NEXT: mov.w r8, #327685 -; CHECK-NEXT: mov.w r9, #5 +; CHECK-NEXT: vldrw.u32 q0, [r6] +; CHECK-NEXT: mov r6, r8 +; CHECK-NEXT: mov.w r10, #5 ; CHECK-NEXT: vmov.i16 q1, #0x5 -; CHECK-NEXT: mov.w r10, #0 -; CHECK-NEXT: movt r6, :upper16:arr_20 +; CHECK-NEXT: mov.w r8, #0 ; CHECK-NEXT: .LBB19_3: @ %for.cond8.preheader ; CHECK-NEXT: @ 
=>This Inner Loop Header: Depth=1 -; CHECK-NEXT: str r8, [r5, #-4] +; CHECK-NEXT: str r6, [r5, #-4] ; CHECK-NEXT: vstrh.16 q1, [r5, #-36] -; CHECK-NEXT: strh.w r9, [r5] +; CHECK-NEXT: strh.w r10, [r5] ; CHECK-NEXT: vstrh.16 q1, [r5, #-20] ; CHECK-NEXT: vstrw.32 q0, [r3] ; CHECK-NEXT: vstrh.16 q0, [r12], #152 -; CHECK-NEXT: vstrh.16 q0, [r6], #152 +; CHECK-NEXT: vstrh.16 q0, [r11], #152 ; CHECK-NEXT: vstrh.16 q0, [r1], #152 ; CHECK-NEXT: vstrh.16 q0, [r0], #152 ; CHECK-NEXT: vstrh.16 q0, [r4], #152 ; CHECK-NEXT: vstrh.16 q0, [r2], #152 ; CHECK-NEXT: vstrh.16 q0, [r7], #152 -; CHECK-NEXT: vstrh.16 q0, [r11], #152 -; CHECK-NEXT: strd r9, r10, [r3, #64] +; CHECK-NEXT: vstrh.16 q0, [r9], #152 +; CHECK-NEXT: strd r10, r8, [r3, #64] ; CHECK-NEXT: adds r5, #38 ; CHECK-NEXT: adds r3, #152 ; CHECK-NEXT: le lr, .LBB19_3 @@ -601,46 +600,46 @@ define i32 @reverted(i1 zeroext %b) { ; CHECK-NEXT: vstrb.8 q1, [r0], #16 ; CHECK-NEXT: letp lr, .LBB19_5 ; CHECK-NEXT: .LBB19_6: @ %for.cond.cleanup6 -; CHECK-NEXT: movw r6, :lower16:arr_20 +; CHECK-NEXT: movw r2, :lower16:arr_20 ; CHECK-NEXT: movw r0, #7376 -; CHECK-NEXT: movt r6, :upper16:arr_20 -; CHECK-NEXT: adds r3, r6, r0 +; CHECK-NEXT: movt r2, :upper16:arr_20 +; CHECK-NEXT: adds r3, r2, r0 ; CHECK-NEXT: movw r0, #7408 -; CHECK-NEXT: add.w r12, r6, r0 +; CHECK-NEXT: add.w r12, r2, r0 ; CHECK-NEXT: movw r0, #7344 -; CHECK-NEXT: add.w r9, r6, r0 +; CHECK-NEXT: add.w r11, r2, r0 ; CHECK-NEXT: movw r0, #7312 -; CHECK-NEXT: adds r2, r6, r0 +; CHECK-NEXT: add.w r9, r2, r0 ; CHECK-NEXT: movw r0, :lower16:arr_21 -; CHECK-NEXT: add.w r1, r6, #7424 -; CHECK-NEXT: add.w r7, r6, #7392 -; CHECK-NEXT: add.w r4, r6, #7360 -; CHECK-NEXT: add.w r5, r6, #7328 -; CHECK-NEXT: add.w r8, r6, #7296 -; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: add.w r1, r2, #7424 +; CHECK-NEXT: add.w r7, r2, #7392 +; CHECK-NEXT: add.w r4, r2, #7360 +; CHECK-NEXT: add.w r5, r2, #7328 +; CHECK-NEXT: add.w r6, r2, #7296 +; CHECK-NEXT: ldr r2, [sp, #8] @ 
4-byte Reload ; CHECK-NEXT: movt r0, :upper16:arr_21 ; CHECK-NEXT: addw r0, r0, #1860 ; CHECK-NEXT: mov.w r10, #5 -; CHECK-NEXT: dls lr, r6 -; CHECK-NEXT: mov.w r6, #327685 +; CHECK-NEXT: dls lr, r2 +; CHECK-NEXT: mov.w r2, #327685 ; CHECK-NEXT: vmov.i16 q1, #0x5 -; CHECK-NEXT: mov.w r11, #0 +; CHECK-NEXT: mov.w r8, #0 ; CHECK-NEXT: .LBB19_7: @ %for.cond8.preheader.1 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: str r6, [r0, #-4] +; CHECK-NEXT: str r2, [r0, #-4] ; CHECK-NEXT: vstrh.16 q1, [r0, #-36] ; CHECK-NEXT: strh.w r10, [r0] ; CHECK-NEXT: vstrh.16 q1, [r0, #-20] ; CHECK-NEXT: vstrw.32 q0, [r3] -; CHECK-NEXT: vstrh.16 q0, [r2], #152 -; CHECK-NEXT: vstrh.16 q0, [r8], #152 -; CHECK-NEXT: vstrh.16 q0, [r5], #152 ; CHECK-NEXT: vstrh.16 q0, [r9], #152 +; CHECK-NEXT: vstrh.16 q0, [r6], #152 +; CHECK-NEXT: vstrh.16 q0, [r5], #152 +; CHECK-NEXT: vstrh.16 q0, [r11], #152 ; CHECK-NEXT: vstrh.16 q0, [r4], #152 ; CHECK-NEXT: vstrh.16 q0, [r7], #152 ; CHECK-NEXT: vstrh.16 q0, [r12], #152 ; CHECK-NEXT: vstrh.16 q0, [r1], #152 -; CHECK-NEXT: strd r10, r11, [r3, #64] +; CHECK-NEXT: strd r10, r8, [r3, #64] ; CHECK-NEXT: adds r0, #38 ; CHECK-NEXT: adds r3, #152 ; CHECK-NEXT: le lr, .LBB19_7 @@ -663,7 +662,7 @@ define i32 @reverted(i1 zeroext %b) { ; CHECK-NEXT: movw r0, #14704 ; CHECK-NEXT: add.w r12, r7, r0 ; CHECK-NEXT: movw r0, #14688 -; CHECK-NEXT: add.w r8, r7, r0 +; CHECK-NEXT: add.w r11, r7, r0 ; CHECK-NEXT: movw r0, #14640 ; CHECK-NEXT: add.w r9, r7, r0 ; CHECK-NEXT: movw r0, #14624 @@ -681,7 +680,7 @@ define i32 @reverted(i1 zeroext %b) { ; CHECK-NEXT: dls lr, r7 ; CHECK-NEXT: mov.w r7, #327685 ; CHECK-NEXT: vmov.i16 q1, #0x5 -; CHECK-NEXT: mov.w r11, #0 +; CHECK-NEXT: mov.w r8, #0 ; CHECK-NEXT: .LBB19_11: @ %for.cond8.preheader.2 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: str r7, [r1, #-4] @@ -694,10 +693,10 @@ define i32 @reverted(i1 zeroext %b) { ; CHECK-NEXT: vstrh.16 q0, [r2], #152 ; CHECK-NEXT: vstrh.16 q0, [r9], #152 ; 
CHECK-NEXT: vstrh.16 q0, [r5], #152 -; CHECK-NEXT: vstrh.16 q0, [r8], #152 +; CHECK-NEXT: vstrh.16 q0, [r11], #152 ; CHECK-NEXT: vstrh.16 q0, [r12], #152 ; CHECK-NEXT: vstrh.16 q0, [r4], #152 -; CHECK-NEXT: strd r10, r11, [r3, #64] +; CHECK-NEXT: strd r10, r8, [r3, #64] ; CHECK-NEXT: adds r1, #38 ; CHECK-NEXT: adds r3, #152 ; CHECK-NEXT: le lr, .LBB19_11 @@ -721,9 +720,9 @@ define i32 @reverted(i1 zeroext %b) { ; CHECK-NEXT: movt r7, :upper16:arr_20 ; CHECK-NEXT: add.w r12, r7, r1 ; CHECK-NEXT: movw r1, #21984 -; CHECK-NEXT: add.w r8, r7, r1 +; CHECK-NEXT: add.w r10, r7, r1 ; CHECK-NEXT: movw r1, #21952 -; CHECK-NEXT: add.w r9, r7, r1 +; CHECK-NEXT: add.w r8, r7, r1 ; CHECK-NEXT: movw r1, #21936 ; CHECK-NEXT: movw r0, #21968 ; CHECK-NEXT: adds r5, r7, r1 @@ -735,7 +734,7 @@ define i32 @reverted(i1 zeroext %b) { ; CHECK-NEXT: add.w r3, r7, #22016 ; CHECK-NEXT: add.w r6, r7, #21888 ; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: mov.w r10, #5 +; CHECK-NEXT: mov.w r9, #5 ; CHECK-NEXT: vmov.i16 q1, #0x5 ; CHECK-NEXT: mov.w r11, #0 ; CHECK-NEXT: dls lr, r7 @@ -744,18 +743,18 @@ define i32 @reverted(i1 zeroext %b) { ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: str r7, [r2, #-4] ; CHECK-NEXT: vstrh.16 q1, [r2, #-36] -; CHECK-NEXT: strh.w r10, [r2] +; CHECK-NEXT: strh.w r9, [r2] ; CHECK-NEXT: vstrh.16 q1, [r2, #-20] ; CHECK-NEXT: vstrw.32 q0, [r0] ; CHECK-NEXT: vstrh.16 q0, [r4], #152 ; CHECK-NEXT: vstrh.16 q0, [r6], #152 ; CHECK-NEXT: vstrh.16 q0, [r1], #152 ; CHECK-NEXT: vstrh.16 q0, [r5], #152 -; CHECK-NEXT: vstrh.16 q0, [r9], #152 ; CHECK-NEXT: vstrh.16 q0, [r8], #152 +; CHECK-NEXT: vstrh.16 q0, [r10], #152 ; CHECK-NEXT: vstrh.16 q0, [r12], #152 ; CHECK-NEXT: vstrh.16 q0, [r3], #152 -; CHECK-NEXT: strd r10, r11, [r0, #64] +; CHECK-NEXT: strd r9, r11, [r0, #64] ; CHECK-NEXT: adds r2, #38 ; CHECK-NEXT: adds r0, #152 ; CHECK-NEXT: le lr, .LBB19_15 diff --git a/llvm/test/CodeGen/Thumb2/mve-phireg.ll b/llvm/test/CodeGen/Thumb2/mve-phireg.ll 
index dad856c0677a1..14ea3a3713224 100644 --- a/llvm/test/CodeGen/Thumb2/mve-phireg.ll +++ b/llvm/test/CodeGen/Thumb2/mve-phireg.ll @@ -14,8 +14,8 @@ define arm_aapcs_vfpcc void @k() { ; CHECK-NEXT: sub sp, #32 ; CHECK-NEXT: adr r5, .LCPI0_0 ; CHECK-NEXT: adr r4, .LCPI0_1 -; CHECK-NEXT: vldrw.u32 q6, [r5] -; CHECK-NEXT: vldrw.u32 q5, [r4] +; CHECK-NEXT: vldrw.u32 q5, [r5] +; CHECK-NEXT: vldrw.u32 q6, [r4] ; CHECK-NEXT: add r0, sp, #16 ; CHECK-NEXT: vmov.i32 q0, #0x1 ; CHECK-NEXT: vmov.i8 q1, #0x0 @@ -25,14 +25,14 @@ define arm_aapcs_vfpcc void @k() { ; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vand q5, q5, q0 ; CHECK-NEXT: vand q6, q6, q0 -; CHECK-NEXT: vcmp.i32 eq, q5, zr -; CHECK-NEXT: vpsel q5, q2, q1 +; CHECK-NEXT: vand q5, q5, q0 ; CHECK-NEXT: vcmp.i32 eq, q6, zr ; CHECK-NEXT: vpsel q6, q2, q1 -; CHECK-NEXT: vstrh.32 q5, [r0] -; CHECK-NEXT: vstrh.32 q6, [r0, #8] +; CHECK-NEXT: vcmp.i32 eq, q5, zr +; CHECK-NEXT: vpsel q5, q2, q1 +; CHECK-NEXT: vstrh.32 q6, [r0] +; CHECK-NEXT: vstrh.32 q5, [r0, #8] ; CHECK-NEXT: vldrw.u32 q5, [r0] ; CHECK-NEXT: vcmp.i16 ne, q5, zr ; CHECK-NEXT: vmov.i32 q5, #0x0 diff --git a/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll b/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll index 43ed5eefbf4c7..ff5a27149cb2e 100644 --- a/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll @@ -17,16 +17,16 @@ define void @arm_cmplx_dot_prod_q15(ptr noundef %pSrcA, ptr noundef %pSrcB, i32 ; CHECK-NEXT: mov.w r5, #0 ; CHECK-NEXT: csel r7, r6, r5, hs ; CHECK-NEXT: add.w lr, r7, #1 -; CHECK-NEXT: mov r4, r5 +; CHECK-NEXT: mov r8, r5 ; CHECK-NEXT: vldrh.u16 q0, [r0], #32 ; CHECK-NEXT: movs r7, #0 -; CHECK-NEXT: mov r8, r5 +; CHECK-NEXT: mov r6, r5 ; CHECK-NEXT: vldrh.u16 q1, [r1], #32 -; CHECK-NEXT: vmlsldava.s16 r4, r7, q0, q1 +; CHECK-NEXT: vmlsldava.s16 r8, r7, q0, q1 ; CHECK-NEXT: vldrh.u16 q2, [r0, #-16] -; CHECK-NEXT: 
vmlaldavax.s16 r8, r5, q0, q1 +; CHECK-NEXT: vmlaldavax.s16 r6, r5, q0, q1 ; CHECK-NEXT: vldrh.u16 q3, [r1, #-16] -; CHECK-NEXT: vmlsldava.s16 r4, r7, q2, q3 +; CHECK-NEXT: vmlsldava.s16 r8, r7, q2, q3 ; CHECK-NEXT: vldrh.u16 q0, [r1], #32 ; CHECK-NEXT: sub.w lr, lr, #1 ; CHECK-NEXT: cmp.w lr, #0 @@ -35,75 +35,72 @@ define void @arm_cmplx_dot_prod_q15(ptr noundef %pSrcA, ptr noundef %pSrcB, i32 ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: .LBB0_2: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q2, q3 +; CHECK-NEXT: vmlaldavax.s16 r6, r5, q2, q3 ; CHECK-NEXT: vldrh.u16 q3, [r1, #-16] -; CHECK-NEXT: vmlsldava.s16 r4, r7, q1, q0 +; CHECK-NEXT: vmlsldava.s16 r8, r7, q1, q0 ; CHECK-NEXT: vldrh.u16 q2, [r0, #-16] -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q1, q0 +; CHECK-NEXT: vmlaldavax.s16 r6, r5, q1, q0 ; CHECK-NEXT: vldrh.u16 q1, [r0], #32 -; CHECK-NEXT: vmlsldava.s16 r4, r7, q2, q3 +; CHECK-NEXT: vmlsldava.s16 r8, r7, q2, q3 ; CHECK-NEXT: vldrh.u16 q0, [r1], #32 ; CHECK-NEXT: le lr, .LBB0_2 ; CHECK-NEXT: .LBB0_3: -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q2, q3 -; CHECK-NEXT: movs r6, #14 -; CHECK-NEXT: and.w r2, r6, r2, lsl #1 -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q1, q0 +; CHECK-NEXT: vmlaldavax.s16 r6, r5, q2, q3 +; CHECK-NEXT: movs r4, #14 +; CHECK-NEXT: and.w r2, r4, r2, lsl #1 +; CHECK-NEXT: vmlaldavax.s16 r6, r5, q1, q0 ; CHECK-NEXT: vldrh.u16 q2, [r0, #-16] -; CHECK-NEXT: vmlsldava.s16 r4, r7, q1, q0 +; CHECK-NEXT: vmlsldava.s16 r8, r7, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r1, #-16] -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q2, q0 +; CHECK-NEXT: vmlaldavax.s16 r6, r5, q2, q0 ; CHECK-NEXT: vctp.16 r2 -; CHECK-NEXT: vmlsldava.s16 r4, r7, q2, q0 +; CHECK-NEXT: vmlsldava.s16 r8, r7, q2, q0 ; CHECK-NEXT: vpst ; CHECK-NEXT: vldrht.u16 q1, [r0] ; CHECK-NEXT: cmp r2, #9 ; CHECK-NEXT: vpsttt ; CHECK-NEXT: vldrht.u16 q0, [r1] -; CHECK-NEXT: vmlsldavat.s16 r4, r7, q1, q0 -; CHECK-NEXT: vmlaldavaxt.s16 r8, r5, q1, q0 -; CHECK-NEXT: blo 
.LBB0_10 +; CHECK-NEXT: vmlsldavat.s16 r8, r7, q1, q0 +; CHECK-NEXT: vmlaldavaxt.s16 r6, r5, q1, q0 +; CHECK-NEXT: blo .LBB0_9 ; CHECK-NEXT: @ %bb.4: @ %do.body.1 ; CHECK-NEXT: subs r2, #8 ; CHECK-NEXT: vctp.16 r2 ; CHECK-NEXT: vpstttt ; CHECK-NEXT: vldrht.u16 q0, [r0, #16] ; CHECK-NEXT: vldrht.u16 q1, [r1, #16] -; CHECK-NEXT: vmlsldavat.s16 r4, r7, q0, q1 -; CHECK-NEXT: vmlaldavaxt.s16 r8, r5, q0, q1 -; CHECK-NEXT: b .LBB0_10 +; CHECK-NEXT: vmlsldavat.s16 r8, r7, q0, q1 +; CHECK-NEXT: vmlaldavaxt.s16 r6, r5, q0, q1 +; CHECK-NEXT: b .LBB0_9 ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: .LBB0_5: @ %if.else -; CHECK-NEXT: mov.w r4, #0 -; CHECK-NEXT: cbz r2, .LBB0_9 +; CHECK-NEXT: mov.w r8, #0 +; CHECK-NEXT: cbz r2, .LBB0_8 ; CHECK-NEXT: @ %bb.6: @ %while.body14.preheader -; CHECK-NEXT: lsls r6, r2, #1 -; CHECK-NEXT: mov r5, r4 -; CHECK-NEXT: mov r7, r4 -; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: dlstp.16 lr, r6 +; CHECK-NEXT: lsls r4, r2, #1 +; CHECK-NEXT: mov r5, r8 +; CHECK-NEXT: movs r6, #0 +; CHECK-NEXT: mov r7, r8 +; CHECK-NEXT: dlstp.16 lr, r4 ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: .LBB0_7: @ %while.body14 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u16 q0, [r0], #16 ; CHECK-NEXT: vldrh.u16 q1, [r1], #16 -; CHECK-NEXT: vmlsldava.s16 r2, r7, q0, q1 -; CHECK-NEXT: vmlaldavax.s16 r4, r5, q0, q1 +; CHECK-NEXT: vmlsldava.s16 r8, r7, q0, q1 +; CHECK-NEXT: vmlaldavax.s16 r6, r5, q0, q1 ; CHECK-NEXT: letp lr, .LBB0_7 -; CHECK-NEXT: @ %bb.8: @ %if.end.loopexit177 -; CHECK-NEXT: mov r8, r4 -; CHECK-NEXT: mov r4, r2 -; CHECK-NEXT: b .LBB0_10 +; CHECK-NEXT: b .LBB0_9 ; CHECK-NEXT: .p2align 2 -; CHECK-NEXT: .LBB0_9: -; CHECK-NEXT: mov r7, r4 -; CHECK-NEXT: mov.w r8, #0 -; CHECK-NEXT: mov r5, r4 -; CHECK-NEXT: .LBB0_10: @ %if.end -; CHECK-NEXT: asrl r4, r7, #6 -; CHECK-NEXT: asrl r8, r5, #6 -; CHECK-NEXT: str r4, [r3] -; CHECK-NEXT: str.w r8, [r12] +; CHECK-NEXT: .LBB0_8: +; CHECK-NEXT: mov r7, r8 +; CHECK-NEXT: movs r6, #0 +; CHECK-NEXT: mov r5, r8 +; 
CHECK-NEXT: .LBB0_9: @ %if.end +; CHECK-NEXT: asrl r8, r7, #6 +; CHECK-NEXT: asrl r6, r5, #6 +; CHECK-NEXT: str.w r8, [r3] +; CHECK-NEXT: str.w r6, [r12] ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} entry: %cmp = icmp ugt i32 %numSamples, 15 diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll index f90af3cc5ba24..c987c4b537200 100644 --- a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll +++ b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll @@ -845,12 +845,12 @@ define void @DCT_mve6(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt ; CHECK-NEXT: .LBB5_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB5_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: add.w r12, r3, r5 +; CHECK-NEXT: add.w r9, r3, r5 ; CHECK-NEXT: vldrw.u32 q6, [r1], #16 ; CHECK-NEXT: vldrw.u32 q7, [r3], #16 -; CHECK-NEXT: add.w r10, r12, r5 +; CHECK-NEXT: add.w r10, r9, r5 ; CHECK-NEXT: vfma.f32 q4, q7, q6 -; CHECK-NEXT: vldrw.u32 q7, [r12] +; CHECK-NEXT: vldrw.u32 q7, [r9] ; CHECK-NEXT: add.w r6, r10, r5 ; CHECK-NEXT: vfma.f32 q5, q7, q6 ; CHECK-NEXT: vldrw.u32 q7, [r10] @@ -1093,18 +1093,16 @@ define void @DCT_mve7(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt ; CHECK-NEXT: vfmat.f32 q5, q0, q7 ; CHECK-NEXT: vldrwt.u32 q0, [r10] ; CHECK-NEXT: add.w r6, r11, r5 +; CHECK-NEXT: vstrw.32 q5, [sp, #40] @ 16-byte Spill ; CHECK-NEXT: vpstt ; CHECK-NEXT: vfmat.f32 q6, q0, q7 ; CHECK-NEXT: vldrwt.u32 q0, [r11] -; CHECK-NEXT: vstrw.32 q6, [sp, #40] @ 16-byte Spill -; CHECK-NEXT: vmov q6, q5 -; CHECK-NEXT: vpst -; CHECK-NEXT: vfmat.f32 q1, q0, q7 ; CHECK-NEXT: vmov q5, q4 ; CHECK-NEXT: vmov q4, q3 -; CHECK-NEXT: vmov q3, q1 -; CHECK-NEXT: vpst +; CHECK-NEXT: vpstt +; CHECK-NEXT: vfmat.f32 q1, q0, q7 ; CHECK-NEXT: vldrwt.u32 q0, [r6] +; CHECK-NEXT: vmov q3, q1 ; CHECK-NEXT: vldrw.u32 q1, [sp, #56] @ 16-byte Reload ; CHECK-NEXT: adds r7, r6, r5 ; CHECK-NEXT: vpstt @@ -1122,8 +1120,7 @@ define void @DCT_mve7(ptr nocapture 
readonly %S, ptr nocapture readonly %pIn, pt ; CHECK-NEXT: vpstt ; CHECK-NEXT: vfmat.f32 q4, q0, q7 ; CHECK-NEXT: vldrwt.u32 q0, [r7] -; CHECK-NEXT: vmov q5, q6 -; CHECK-NEXT: vldrw.u32 q6, [sp, #40] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q5, [sp, #40] @ 16-byte Reload ; CHECK-NEXT: vpst ; CHECK-NEXT: vfmat.f32 q2, q0, q7 ; CHECK-NEXT: le lr, .LBB6_3 diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll index 096d4382d2c35..bd0e5dabea3cf 100644 --- a/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll +++ b/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll @@ -8,23 +8,24 @@ define i32 @vaddv(ptr nocapture readonly %data, i32 %N) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: mov lr, r1 ; CHECK-NEXT: cmp r1, #1 ; CHECK-NEXT: blt .LBB0_4 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-NEXT: mov r1, r0 -; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: dls lr, r1 +; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: .LBB0_2: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r1], #32 -; CHECK-NEXT: vaddva.s32 r0, q0 -; CHECK-NEXT: vldrw.u32 q0, [r1, #-16] -; CHECK-NEXT: vaddva.s32 r0, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0], #32 +; CHECK-NEXT: vaddva.s32 r2, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #-16] +; CHECK-NEXT: vaddva.s32 r2, q0 ; CHECK-NEXT: le lr, .LBB0_2 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup +; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .LBB0_4: -; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: pop {r7, pc} entry: %cmp11 = icmp sgt i32 %N, 0 diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll index cba0f9cbba2ca..3e7ed0b096b82 100644 --- a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll +++ b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll @@ -212,9 +212,9 @@ define ptr @test(ptr nocapture readonly %input_row, 
ptr nocapture readonly %inpu ; CHECK-NEXT: beq .LBB2_8 ; CHECK-NEXT: @ %bb.2: @ %for.body.lr.ph ; CHECK-NEXT: ldr r3, [sp, #64] -; CHECK-NEXT: mov.w r9, #0 +; CHECK-NEXT: mov.w r11, #0 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: ldr.w r11, [sp, #56] +; CHECK-NEXT: ldr.w r9, [sp, #56] ; CHECK-NEXT: add.w r0, r1, r3, lsl #1 ; CHECK-NEXT: str r0, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: adds r0, r1, r3 @@ -235,15 +235,15 @@ define ptr @test(ptr nocapture readonly %input_row, ptr nocapture readonly %inpu ; CHECK-NEXT: add.w r1, r8, r10 ; CHECK-NEXT: add r1, r6 ; CHECK-NEXT: add r1, r12 -; CHECK-NEXT: strb.w r1, [r3, r9] -; CHECK-NEXT: add.w r9, r9, #1 -; CHECK-NEXT: cmp r9, r2 +; CHECK-NEXT: strb.w r1, [r3, r11] +; CHECK-NEXT: add.w r11, r11, #1 +; CHECK-NEXT: cmp r11, r2 ; CHECK-NEXT: beq .LBB2_8 ; CHECK-NEXT: .LBB2_5: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB2_7 Depth 2 ; CHECK-NEXT: ldr r1, [sp, #68] -; CHECK-NEXT: ldr.w r12, [r1, r9, lsl #2] +; CHECK-NEXT: ldr.w r12, [r1, r11, lsl #2] ; CHECK-NEXT: subs r1, r0, r0 ; CHECK-NEXT: ble .LBB2_3 ; CHECK-NEXT: @ %bb.6: @ %for.body24.preheader @@ -254,7 +254,7 @@ define ptr @test(ptr nocapture readonly %input_row, ptr nocapture readonly %inpu ; CHECK-NEXT: dls lr, r1 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: mov r8, r12 -; CHECK-NEXT: mla r7, r9, r7, r3 +; CHECK-NEXT: mla r7, r11, r7, r3 ; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: ldrd r4, r3, [sp] @ 8-byte Folded Reload ; CHECK-NEXT: mov r10, r12 @@ -262,17 +262,17 @@ define ptr @test(ptr nocapture readonly %input_row, ptr nocapture readonly %inpu ; CHECK-NEXT: @ Parent Loop BB2_5 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: vldrb.s16 q0, [r4], #8 -; CHECK-NEXT: vadd.i16 q1, q0, r11 +; CHECK-NEXT: vadd.i16 q1, q0, r9 ; CHECK-NEXT: vldrb.s16 q0, [r7], #8 ; CHECK-NEXT: vmlava.s16 r12, q0, q1 ; CHECK-NEXT: vldrb.s16 q1, [r5], #8 -; CHECK-NEXT: vadd.i16 
q1, q1, r11 +; CHECK-NEXT: vadd.i16 q1, q1, r9 ; CHECK-NEXT: vmlava.s16 r6, q0, q1 ; CHECK-NEXT: vldrb.s16 q1, [r3], #8 -; CHECK-NEXT: vadd.i16 q1, q1, r11 +; CHECK-NEXT: vadd.i16 q1, q1, r9 ; CHECK-NEXT: vmlava.s16 r8, q0, q1 ; CHECK-NEXT: vldrb.s16 q1, [r1], #8 -; CHECK-NEXT: vadd.i16 q1, q1, r11 +; CHECK-NEXT: vadd.i16 q1, q1, r9 ; CHECK-NEXT: vmlava.s16 r10, q0, q1 ; CHECK-NEXT: le lr, .LBB2_7 ; CHECK-NEXT: b .LBB2_4 @@ -395,9 +395,9 @@ define ptr @test_optsize(ptr nocapture readonly %input_row, ptr nocapture readon ; CHECK-NEXT: beq .LBB3_8 ; CHECK-NEXT: @ %bb.2: @ %for.body.lr.ph ; CHECK-NEXT: ldr r3, [sp, #64] -; CHECK-NEXT: mov.w r9, #0 +; CHECK-NEXT: mov.w r11, #0 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: ldr.w r11, [sp, #56] +; CHECK-NEXT: ldr.w r9, [sp, #56] ; CHECK-NEXT: add.w r0, r1, r3, lsl #1 ; CHECK-NEXT: str r0, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: adds r0, r1, r3 @@ -411,7 +411,7 @@ define ptr @test_optsize(ptr nocapture readonly %input_row, ptr nocapture readon ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB3_5 Depth 2 ; CHECK-NEXT: ldr r1, [sp, #68] -; CHECK-NEXT: ldr.w r12, [r1, r9, lsl #2] +; CHECK-NEXT: ldr.w r12, [r1, r11, lsl #2] ; CHECK-NEXT: subs r1, r0, r0 ; CHECK-NEXT: ble .LBB3_6 ; CHECK-NEXT: @ %bb.4: @ %for.body24.preheader @@ -422,7 +422,7 @@ define ptr @test_optsize(ptr nocapture readonly %input_row, ptr nocapture readon ; CHECK-NEXT: dls lr, r1 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: mov r8, r12 -; CHECK-NEXT: mla r7, r9, r7, r3 +; CHECK-NEXT: mla r7, r11, r7, r3 ; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: ldrd r4, r3, [sp] @ 8-byte Folded Reload ; CHECK-NEXT: mov r10, r12 @@ -430,17 +430,17 @@ define ptr @test_optsize(ptr nocapture readonly %input_row, ptr nocapture readon ; CHECK-NEXT: @ Parent Loop BB3_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: vldrb.s16 q0, [r4], #8 -; CHECK-NEXT: vadd.i16 q1, q0, r11 +; 
CHECK-NEXT: vadd.i16 q1, q0, r9 ; CHECK-NEXT: vldrb.s16 q0, [r7], #8 ; CHECK-NEXT: vmlava.s16 r12, q0, q1 ; CHECK-NEXT: vldrb.s16 q1, [r5], #8 -; CHECK-NEXT: vadd.i16 q1, q1, r11 +; CHECK-NEXT: vadd.i16 q1, q1, r9 ; CHECK-NEXT: vmlava.s16 r6, q0, q1 ; CHECK-NEXT: vldrb.s16 q1, [r3], #8 -; CHECK-NEXT: vadd.i16 q1, q1, r11 +; CHECK-NEXT: vadd.i16 q1, q1, r9 ; CHECK-NEXT: vmlava.s16 r8, q0, q1 ; CHECK-NEXT: vldrb.s16 q1, [r1], #8 -; CHECK-NEXT: vadd.i16 q1, q1, r11 +; CHECK-NEXT: vadd.i16 q1, q1, r9 ; CHECK-NEXT: vmlava.s16 r10, q0, q1 ; CHECK-NEXT: le lr, .LBB3_5 ; CHECK-NEXT: b .LBB3_7 @@ -454,9 +454,9 @@ define ptr @test_optsize(ptr nocapture readonly %input_row, ptr nocapture readon ; CHECK-NEXT: add.w r1, r8, r10 ; CHECK-NEXT: add r1, r6 ; CHECK-NEXT: add r1, r12 -; CHECK-NEXT: strb.w r1, [r3, r9] -; CHECK-NEXT: add.w r9, r9, #1 -; CHECK-NEXT: cmp r9, r2 +; CHECK-NEXT: strb.w r1, [r3, r11] +; CHECK-NEXT: add.w r11, r11, #1 +; CHECK-NEXT: cmp r11, r2 ; CHECK-NEXT: bne .LBB3_3 ; CHECK-NEXT: .LBB3_8: @ %if.end ; CHECK-NEXT: ldr r0, [sp, #72] diff --git a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll index 29b56639bd769..16e7736886e94 100644 --- a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll @@ -12,12 +12,13 @@ define arm_aapcs_vfpcc void @ssatmul_s_q31(ptr nocapture readonly %pSrcA, ptr no ; CHECK-NEXT: beq.w .LBB0_8 ; CHECK-NEXT: @ %bb.1: @ %entry ; CHECK-NEXT: mov r11, r2 +; CHECK-NEXT: mov r8, r1 ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: bne .LBB0_3 ; CHECK-NEXT: @ %bb.2: ; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: mov r8, r1 +; CHECK-NEXT: mov r1, r8 ; CHECK-NEXT: mov r10, r11 ; CHECK-NEXT: b .LBB0_6 ; CHECK-NEXT: .LBB0_3: @ %vector.ph @@ -29,7 +30,7 @@ define arm_aapcs_vfpcc void @ssatmul_s_q31(ptr nocapture readonly %pSrcA, ptr no ; CHECK-NEXT: add.w r10, r11, r2, lsl #2 ; CHECK-NEXT: add.w lr, r6, r7, lsr #1 ; CHECK-NEXT: str r2, [sp] @ 4-byte 
Spill -; CHECK-NEXT: add.w r8, r1, r2, lsl #2 +; CHECK-NEXT: add.w r1, r8, r2, lsl #2 ; CHECK-NEXT: add.w r12, r0, r2, lsl #2 ; CHECK-NEXT: vldrw.u32 q0, [r4] ; CHECK-NEXT: vmvn.i32 q1, #0x80000000 @@ -37,7 +38,7 @@ define arm_aapcs_vfpcc void @ssatmul_s_q31(ptr nocapture readonly %pSrcA, ptr no ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrd r4, r2, [r0], #8 ; CHECK-NEXT: movs r5, #0 -; CHECK-NEXT: ldrd r7, r6, [r1], #8 +; CHECK-NEXT: ldrd r7, r6, [r8], #8 ; CHECK-NEXT: smull r4, r7, r7, r4 ; CHECK-NEXT: asrl r4, r7, #31 ; CHECK-NEXT: rsbs.w r9, r4, #-2147483648 @@ -80,22 +81,22 @@ define arm_aapcs_vfpcc void @ssatmul_s_q31(ptr nocapture readonly %pSrcA, ptr no ; CHECK-NEXT: .LBB0_6: @ %for.body.preheader ; CHECK-NEXT: sub.w lr, r3, r2 ; CHECK-NEXT: mov.w r0, #-1 -; CHECK-NEXT: mov.w r1, #-2147483648 +; CHECK-NEXT: mov.w r2, #-2147483648 ; CHECK-NEXT: mvn r3, #-2147483648 ; CHECK-NEXT: .LBB0_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr r2, [r12], #4 -; CHECK-NEXT: ldr r4, [r8], #4 -; CHECK-NEXT: smull r2, r5, r4, r2 -; CHECK-NEXT: asrl r2, r5, #31 -; CHECK-NEXT: subs r4, r1, r2 -; CHECK-NEXT: sbcs.w r4, r0, r5 -; CHECK-NEXT: csel r2, r2, r1, lt -; CHECK-NEXT: csel r4, r5, r0, lt -; CHECK-NEXT: subs r5, r2, r3 -; CHECK-NEXT: sbcs r4, r4, #0 -; CHECK-NEXT: csel r2, r2, r3, lt -; CHECK-NEXT: str r2, [r10], #4 +; CHECK-NEXT: ldr r4, [r12], #4 +; CHECK-NEXT: ldr r5, [r1], #4 +; CHECK-NEXT: smull r4, r5, r5, r4 +; CHECK-NEXT: asrl r4, r5, #31 +; CHECK-NEXT: subs r6, r2, r4 +; CHECK-NEXT: sbcs.w r6, r0, r5 +; CHECK-NEXT: csel r4, r4, r2, lt +; CHECK-NEXT: csel r5, r5, r0, lt +; CHECK-NEXT: subs r6, r4, r3 +; CHECK-NEXT: sbcs r5, r5, #0 +; CHECK-NEXT: csel r4, r4, r3, lt +; CHECK-NEXT: str r4, [r10], #4 ; CHECK-NEXT: le lr, .LBB0_7 ; CHECK-NEXT: .LBB0_8: @ %for.cond.cleanup ; CHECK-NEXT: add sp, #8 diff --git a/llvm/test/CodeGen/Thumb2/pr52817.ll b/llvm/test/CodeGen/Thumb2/pr52817.ll index 
87615f0a1f7ef..4cc0960e1f57f 100644 --- a/llvm/test/CodeGen/Thumb2/pr52817.ll +++ b/llvm/test/CodeGen/Thumb2/pr52817.ll @@ -18,25 +18,25 @@ define i32 @test(ptr %arg, ptr %arg1, ptr %arg2) #0 !dbg !6 { ; CHECK-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-NEXT: add r7, sp, #12 ; CHECK-NEXT: str r8, [sp, #-4]! -; CHECK-NEXT: mov.w lr, #0 -; CHECK-NEXT: mov.w r9, #1 -; CHECK-NEXT: movw r12, #4100 +; CHECK-NEXT: mov.w r9, #0 ; CHECK-NEXT: movs r3, #0 +; CHECK-NEXT: mov.w r12, #1 +; CHECK-NEXT: movw lr, #4100 ; CHECK-NEXT: LBB0_1: @ %bb3 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r5, r3, #1 -; CHECK-NEXT: str.w lr, [r2] -; CHECK-NEXT: cmp.w lr, #0 +; CHECK-NEXT: str.w r9, [r2] +; CHECK-NEXT: cmp.w r9, #0 ; CHECK-NEXT: add.w r4, r0, r5, lsl #2 -; CHECK-NEXT: add.w r8, r4, r12 -; CHECK-NEXT: lsl.w r4, r9, r3 +; CHECK-NEXT: add.w r8, r4, lr +; CHECK-NEXT: lsl.w r4, r12, r3 ; CHECK-NEXT: and.w r3, r3, r4 ; CHECK-NEXT: add.w r4, r1, r5, lsl #2 ; CHECK-NEXT: itte ne ; CHECK-NEXT: movne r6, #0 ; CHECK-NEXT: Ltmp0: ; CHECK-NEXT: @DEBUG_VALUE: test:this <- [DW_OP_LLVM_arg 0, DW_OP_plus_uconst 135168, DW_OP_LLVM_arg 1, DW_OP_constu 4, DW_OP_mul, DW_OP_plus, DW_OP_plus_uconst 4, DW_OP_stack_value] $r0, $r5 -; CHECK-NEXT: .loc 1 28 24 prologue_end @ test.cpp:28:24 +; CHECK-NEXT: .loc 1 28 24 prologue_end @ test.cpp:28:24 @[ test.cpp:204:23 ] ; CHECK-NEXT: strne.w r6, [r8] ; CHECK-NEXT: moveq r6, #1 ; CHECK-NEXT: ldr r4, [r4, #4] diff --git a/llvm/test/CodeGen/VE/Scalar/br_jt.ll b/llvm/test/CodeGen/VE/Scalar/br_jt.ll index fd880a7f42912..a418ef4892b33 100644 --- a/llvm/test/CodeGen/VE/Scalar/br_jt.ll +++ b/llvm/test/CodeGen/VE/Scalar/br_jt.ll @@ -641,53 +641,53 @@ define signext i32 @br_jt8_m(i32 signext %0, i32 signext %1) { ; PIC: # %bb.0: ; PIC-NEXT: st %s15, 24(, %s11) ; PIC-NEXT: st %s16, 32(, %s11) -; PIC-NEXT: and %s2, %s0, (32)0 -; PIC-NEXT: adds.w.sx %s0, -1, %s2 -; PIC-NEXT: cmpu.w %s3, 8, %s0 +; PIC-NEXT: and %s0, %s0, (32)0 +; PIC-NEXT: adds.w.sx 
%s3, -1, %s0 +; PIC-NEXT: cmpu.w %s2, 8, %s3 ; PIC-NEXT: lea %s15, _GLOBAL_OFFSET_TABLE_@pc_lo(-24) ; PIC-NEXT: and %s15, %s15, (32)0 ; PIC-NEXT: sic %s16 ; PIC-NEXT: lea.sl %s15, _GLOBAL_OFFSET_TABLE_@pc_hi(%s16, %s15) -; PIC-NEXT: brgt.w 0, %s3, .LBB7_9 +; PIC-NEXT: brgt.w 0, %s2, .LBB7_9 ; PIC-NEXT: # %bb.1: -; PIC-NEXT: and %s1, %s1, (32)0 -; PIC-NEXT: adds.w.zx %s0, %s0, (0)1 -; PIC-NEXT: sll %s0, %s0, 2 +; PIC-NEXT: and %s2, %s1, (32)0 +; PIC-NEXT: adds.w.zx %s1, %s3, (0)1 +; PIC-NEXT: sll %s1, %s1, 2 ; PIC-NEXT: lea %s3, .LJTI7_0@gotoff_lo ; PIC-NEXT: and %s3, %s3, (32)0 ; PIC-NEXT: lea.sl %s3, .LJTI7_0@gotoff_hi(%s3, %s15) -; PIC-NEXT: ldl.sx %s0, (%s0, %s3) +; PIC-NEXT: ldl.sx %s1, (%s1, %s3) ; PIC-NEXT: lea %s3, br_jt8_m@gotoff_lo ; PIC-NEXT: and %s3, %s3, (32)0 ; PIC-NEXT: lea.sl %s3, br_jt8_m@gotoff_hi(%s3, %s15) -; PIC-NEXT: adds.l %s3, %s3, %s0 -; PIC-NEXT: or %s0, 3, (0)1 +; PIC-NEXT: adds.l %s3, %s3, %s1 +; PIC-NEXT: or %s1, 3, (0)1 ; PIC-NEXT: b.l.t (, %s3) ; PIC-NEXT: .LBB7_2: -; PIC-NEXT: or %s0, 0, (0)1 +; PIC-NEXT: or %s1, 0, (0)1 ; PIC-NEXT: br.l.t .LBB7_10 ; PIC-NEXT: .LBB7_9: -; PIC-NEXT: or %s0, 0, %s2 +; PIC-NEXT: or %s1, 0, %s0 ; PIC-NEXT: br.l.t .LBB7_10 ; PIC-NEXT: .LBB7_6: -; PIC-NEXT: adds.w.sx %s0, -2, %s1 +; PIC-NEXT: adds.w.sx %s1, -2, %s2 ; PIC-NEXT: br.l.t .LBB7_10 ; PIC-NEXT: .LBB7_8: -; PIC-NEXT: or %s0, 11, (0)1 +; PIC-NEXT: or %s1, 11, (0)1 ; PIC-NEXT: br.l.t .LBB7_10 ; PIC-NEXT: .LBB7_7: -; PIC-NEXT: or %s0, 10, (0)1 +; PIC-NEXT: or %s1, 10, (0)1 ; PIC-NEXT: br.l.t .LBB7_10 ; PIC-NEXT: .LBB7_3: -; PIC-NEXT: or %s0, 4, (0)1 +; PIC-NEXT: or %s1, 4, (0)1 ; PIC-NEXT: br.l.t .LBB7_10 ; PIC-NEXT: .LBB7_4: -; PIC-NEXT: adds.w.sx %s0, 3, %s1 +; PIC-NEXT: adds.w.sx %s1, 3, %s2 ; PIC-NEXT: br.l.t .LBB7_10 ; PIC-NEXT: .LBB7_5: -; PIC-NEXT: adds.w.sx %s0, -5, %s1 +; PIC-NEXT: adds.w.sx %s1, -5, %s2 ; PIC-NEXT: .LBB7_10: -; PIC-NEXT: adds.w.sx %s0, %s0, (0)1 +; PIC-NEXT: adds.w.sx %s0, %s1, (0)1 ; PIC-NEXT: ld %s16, 32(, %s11) ; 
PIC-NEXT: ld %s15, 24(, %s11) ; PIC-NEXT: b.l.t (, %s10) diff --git a/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll b/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll index 1962ddebc2115..99338d8f063f5 100644 --- a/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll +++ b/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll @@ -126,14 +126,14 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr ; CHECK-NEXT: decl {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; CHECK-NEXT: jmp LBB0_8 ; CHECK-NEXT: LBB0_18: ## %bb43 -; CHECK-NEXT: Ltmp5: ; CHECK-NEXT: movl %esi, %ebx +; CHECK-NEXT: Ltmp5: ; CHECK-NEXT: calll _OnOverFlow ; CHECK-NEXT: Ltmp6: ; CHECK-NEXT: jmp LBB0_3 ; CHECK-NEXT: LBB0_2: ## %bb29 -; CHECK-NEXT: Ltmp7: ; CHECK-NEXT: movl %esi, %ebx +; CHECK-NEXT: Ltmp7: ; CHECK-NEXT: calll _OnOverFlow ; CHECK-NEXT: Ltmp8: ; CHECK-NEXT: LBB0_3: ## %bb30 diff --git a/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll b/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll index 06cf968512db8..8a8e7a3b4df2c 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll @@ -297,30 +297,30 @@ define dso_local void @test6(i16 signext %0) nounwind { ; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: movl $buf, %ecx -; CHECK-NEXT: movl $32, %edx -; CHECK-NEXT: xorl %esi, %esi +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: movl $buf, %edx +; CHECK-NEXT: movl $32, %esi ; CHECK-NEXT: jmp .LBB5_1 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB5_3: # %if.false ; CHECK-NEXT: # in Loop: Header=BB5_1 Depth=1 -; CHECK-NEXT: decl %esi +; CHECK-NEXT: decl %eax ; CHECK-NEXT: .LBB5_4: # %loop.bb2 ; CHECK-NEXT: # in Loop: Header=BB5_1 Depth=1 -; CHECK-NEXT: leal (%rdi,%rsi), %r8d +; CHECK-NEXT: leal (%rdi,%rax), %r8d ; CHECK-NEXT: movw %r8w, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: cmpw $7, %si +; CHECK-NEXT: cmpw $7, %ax ; 
CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) ; CHECK-NEXT: tilezero %tmm0 -; CHECK-NEXT: tilestored %tmm0, (%rcx,%rdx) +; CHECK-NEXT: tilestored %tmm0, (%rdx,%rsi) ; CHECK-NEXT: jne .LBB5_5 ; CHECK-NEXT: .LBB5_1: # %loop.bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: testb %al, %al +; CHECK-NEXT: testb %cl, %cl ; CHECK-NEXT: jne .LBB5_3 ; CHECK-NEXT: # %bb.2: # %if.true ; CHECK-NEXT: # in Loop: Header=BB5_1 Depth=1 -; CHECK-NEXT: incl %esi +; CHECK-NEXT: incl %eax ; CHECK-NEXT: jmp .LBB5_4 ; CHECK-NEXT: .LBB5_5: # %exit ; CHECK-NEXT: tilerelease diff --git a/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll b/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll index 254f824379d56..944319d2dc373 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll @@ -242,25 +242,25 @@ define dso_local void @test3(ptr%buf) nounwind { ; CHECK-NEXT: jne .LBB1_3 ; CHECK-NEXT: # %bb.1: # %loop.header.preheader ; CHECK-NEXT: movq %rdi, %rbx -; CHECK-NEXT: movl $32, %r14d -; CHECK-NEXT: xorl %r15d, %r15d +; CHECK-NEXT: xorl %r14d, %r14d +; CHECK-NEXT: movl $32, %r15d ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB1_2: # %loop.header ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: tilestored %tmm0, (%rbx,%r14) +; CHECK-NEXT: tilestored %tmm0, (%rbx,%r15) ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: tilezero %tmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq foo ; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; CHECK-NEXT: tilezero %tmm0 -; CHECK-NEXT: tileloadd (%rbx,%r14), %tmm1 -; CHECK-NEXT: tileloadd (%rbx,%r14), %tmm2 +; CHECK-NEXT: tileloadd (%rbx,%r15), %tmm1 +; CHECK-NEXT: tileloadd (%rbx,%r15), %tmm2 ; CHECK-NEXT: tdpbssd %tmm2, %tmm1, %tmm0 -; CHECK-NEXT: tilestored %tmm0, (%rbx,%r14) +; CHECK-NEXT: tilestored %tmm0, (%rbx,%r15) ; CHECK-NEXT: tilezero %tmm0 -; CHECK-NEXT: incl %r15d -; CHECK-NEXT: cmpw $100, %r15w +; CHECK-NEXT: incl %r14d +; CHECK-NEXT: cmpw $100, %r14w ; CHECK-NEXT: jl .LBB1_2 ; CHECK-NEXT: 
.LBB1_3: # %exit ; CHECK-NEXT: addq $72, %rsp @@ -297,12 +297,12 @@ define dso_local void @test3(ptr%buf) nounwind { ; EGPR-NEXT: # fixup A - offset: 1, value: .LBB1_3, kind: FK_PCRel_1 ; EGPR-NEXT: # %bb.1: # %loop.header.preheader ; EGPR-NEXT: movq %rdi, %rbx # encoding: [0x48,0x89,0xfb] -; EGPR-NEXT: movl $32, %r14d # encoding: [0x41,0xbe,0x20,0x00,0x00,0x00] -; EGPR-NEXT: xorl %r15d, %r15d # encoding: [0x45,0x31,0xff] +; EGPR-NEXT: xorl %r14d, %r14d # encoding: [0x45,0x31,0xf6] +; EGPR-NEXT: movl $32, %r15d # encoding: [0x41,0xbf,0x20,0x00,0x00,0x00] ; EGPR-NEXT: .p2align 4 ; EGPR-NEXT: .LBB1_2: # %loop.header ; EGPR-NEXT: # =>This Inner Loop Header: Depth=1 -; EGPR-NEXT: tilestored %tmm0, (%rbx,%r14) # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7a,0x4b,0x04,0x33] +; EGPR-NEXT: tilestored %tmm0, (%rbx,%r15) # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7a,0x4b,0x04,0x3b] ; EGPR-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] ; EGPR-NEXT: tilezero %tmm0 # encoding: [0xc4,0xe2,0x7b,0x49,0xc0] ; EGPR-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] @@ -310,13 +310,13 @@ define dso_local void @test3(ptr%buf) nounwind { ; EGPR-NEXT: # fixup A - offset: 1, value: foo, kind: reloc_branch_4byte_pcrel ; EGPR-NEXT: ldtilecfg {{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x44,0x24,0x08] ; EGPR-NEXT: tilezero %tmm0 # encoding: [0xc4,0xe2,0x7b,0x49,0xc0] -; EGPR-NEXT: tileloadd (%rbx,%r14), %tmm1 # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7b,0x4b,0x0c,0x33] -; EGPR-NEXT: tileloadd (%rbx,%r14), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7b,0x4b,0x14,0x33] +; EGPR-NEXT: tileloadd (%rbx,%r15), %tmm1 # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7b,0x4b,0x0c,0x3b] +; EGPR-NEXT: tileloadd (%rbx,%r15), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7b,0x4b,0x14,0x3b] ; EGPR-NEXT: tdpbssd %tmm2, %tmm1, %tmm0 # encoding: [0xc4,0xe2,0x6b,0x5e,0xc1] -; EGPR-NEXT: tilestored %tmm0, (%rbx,%r14) # EVEX TO VEX Compression 
encoding: [0xc4,0xa2,0x7a,0x4b,0x04,0x33] +; EGPR-NEXT: tilestored %tmm0, (%rbx,%r15) # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7a,0x4b,0x04,0x3b] ; EGPR-NEXT: tilezero %tmm0 # encoding: [0xc4,0xe2,0x7b,0x49,0xc0] -; EGPR-NEXT: incl %r15d # encoding: [0x41,0xff,0xc7] -; EGPR-NEXT: cmpw $100, %r15w # encoding: [0x66,0x41,0x83,0xff,0x64] +; EGPR-NEXT: incl %r14d # encoding: [0x41,0xff,0xc6] +; EGPR-NEXT: cmpw $100, %r14w # encoding: [0x66,0x41,0x83,0xfe,0x64] ; EGPR-NEXT: jl .LBB1_2 # encoding: [0x7c,A] ; EGPR-NEXT: # fixup A - offset: 1, value: .LBB1_2, kind: FK_PCRel_1 ; EGPR-NEXT: .LBB1_3: # %exit diff --git a/llvm/test/CodeGen/X86/atomic32.ll b/llvm/test/CodeGen/X86/atomic32.ll index f4666738db7d2..3263b9f61a281 100644 --- a/llvm/test/CodeGen/X86/atomic32.ll +++ b/llvm/test/CodeGen/X86/atomic32.ll @@ -228,9 +228,10 @@ define void @atomic_fetch_nand32(i32 %x) nounwind { ; X64-NEXT: andl %edx, %ecx ; X64-NEXT: notl %ecx ; X64-NEXT: lock cmpxchgl %ecx, sc32(%rip) -; X64-NEXT: sete %cl -; X64-NEXT: testb $1, %cl -; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: movl %eax, %ecx +; X64-NEXT: sete %al +; X64-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: testb $1, %al ; X64-NEXT: jne .LBB5_2 ; X64-NEXT: jmp .LBB5_1 ; X64-NEXT: .LBB5_2: # %atomicrmw.end @@ -251,9 +252,10 @@ define void @atomic_fetch_nand32(i32 %x) nounwind { ; X86-NEXT: andl %edx, %ecx ; X86-NEXT: notl %ecx ; X86-NEXT: lock cmpxchgl %ecx, sc32 -; X86-NEXT: sete %cl -; X86-NEXT: testb $1, %cl -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: sete %al +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: testb $1, %al ; X86-NEXT: jne .LBB5_2 ; X86-NEXT: jmp .LBB5_1 ; X86-NEXT: .LBB5_2: # %atomicrmw.end @@ -277,9 +279,10 @@ define void @atomic_fetch_max32(i32 %x) nounwind { ; X64-NEXT: subl %ecx, %edx ; X64-NEXT: cmovgl %eax, %ecx ; X64-NEXT: lock cmpxchgl %ecx, sc32(%rip) 
-; X64-NEXT: sete %cl -; X64-NEXT: testb $1, %cl -; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: movl %eax, %ecx +; X64-NEXT: sete %al +; X64-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: testb $1, %al ; X64-NEXT: jne .LBB6_2 ; X64-NEXT: jmp .LBB6_1 ; X64-NEXT: .LBB6_2: # %atomicrmw.end @@ -300,9 +303,10 @@ define void @atomic_fetch_max32(i32 %x) nounwind { ; X86-CMOV-NEXT: subl %ecx, %edx ; X86-CMOV-NEXT: cmovgl %eax, %ecx ; X86-CMOV-NEXT: lock cmpxchgl %ecx, sc32 -; X86-CMOV-NEXT: sete %cl -; X86-CMOV-NEXT: testb $1, %cl -; X86-CMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-CMOV-NEXT: movl %eax, %ecx +; X86-CMOV-NEXT: sete %al +; X86-CMOV-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-CMOV-NEXT: testb $1, %al ; X86-CMOV-NEXT: jne .LBB6_2 ; X86-CMOV-NEXT: jmp .LBB6_1 ; X86-CMOV-NEXT: .LBB6_2: # %atomicrmw.end @@ -334,9 +338,10 @@ define void @atomic_fetch_max32(i32 %x) nounwind { ; X86-NOCMOV-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NOCMOV-NEXT: lock cmpxchgl %ecx, sc32 -; X86-NOCMOV-NEXT: sete %cl -; X86-NOCMOV-NEXT: testb $1, %cl -; X86-NOCMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOCMOV-NEXT: movl %eax, %ecx +; X86-NOCMOV-NEXT: sete %al +; X86-NOCMOV-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOCMOV-NEXT: testb $1, %al ; X86-NOCMOV-NEXT: jne .LBB6_2 ; X86-NOCMOV-NEXT: jmp .LBB6_1 ; X86-NOCMOV-NEXT: .LBB6_2: # %atomicrmw.end @@ -368,9 +373,10 @@ define void @atomic_fetch_max32(i32 %x) nounwind { ; X86-NOX87-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NOX87-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NOX87-NEXT: lock cmpxchgl %ecx, sc32 -; X86-NOX87-NEXT: sete %cl -; X86-NOX87-NEXT: testb $1, %cl -; X86-NOX87-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOX87-NEXT: movl %eax, %ecx +; X86-NOX87-NEXT: sete %al +; 
X86-NOX87-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOX87-NEXT: testb $1, %al ; X86-NOX87-NEXT: jne .LBB6_2 ; X86-NOX87-NEXT: jmp .LBB6_1 ; X86-NOX87-NEXT: .LBB6_2: # %atomicrmw.end @@ -394,9 +400,10 @@ define void @atomic_fetch_min32(i32 %x) nounwind { ; X64-NEXT: subl %ecx, %edx ; X64-NEXT: cmovlel %eax, %ecx ; X64-NEXT: lock cmpxchgl %ecx, sc32(%rip) -; X64-NEXT: sete %cl -; X64-NEXT: testb $1, %cl -; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: movl %eax, %ecx +; X64-NEXT: sete %al +; X64-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: testb $1, %al ; X64-NEXT: jne .LBB7_2 ; X64-NEXT: jmp .LBB7_1 ; X64-NEXT: .LBB7_2: # %atomicrmw.end @@ -417,9 +424,10 @@ define void @atomic_fetch_min32(i32 %x) nounwind { ; X86-CMOV-NEXT: subl %ecx, %edx ; X86-CMOV-NEXT: cmovlel %eax, %ecx ; X86-CMOV-NEXT: lock cmpxchgl %ecx, sc32 -; X86-CMOV-NEXT: sete %cl -; X86-CMOV-NEXT: testb $1, %cl -; X86-CMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-CMOV-NEXT: movl %eax, %ecx +; X86-CMOV-NEXT: sete %al +; X86-CMOV-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-CMOV-NEXT: testb $1, %al ; X86-CMOV-NEXT: jne .LBB7_2 ; X86-CMOV-NEXT: jmp .LBB7_1 ; X86-CMOV-NEXT: .LBB7_2: # %atomicrmw.end @@ -451,9 +459,10 @@ define void @atomic_fetch_min32(i32 %x) nounwind { ; X86-NOCMOV-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NOCMOV-NEXT: lock cmpxchgl %ecx, sc32 -; X86-NOCMOV-NEXT: sete %cl -; X86-NOCMOV-NEXT: testb $1, %cl -; X86-NOCMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOCMOV-NEXT: movl %eax, %ecx +; X86-NOCMOV-NEXT: sete %al +; X86-NOCMOV-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOCMOV-NEXT: testb $1, %al ; X86-NOCMOV-NEXT: jne .LBB7_2 ; X86-NOCMOV-NEXT: jmp .LBB7_1 ; X86-NOCMOV-NEXT: .LBB7_2: # %atomicrmw.end @@ -485,9 +494,10 @@ define void 
@atomic_fetch_min32(i32 %x) nounwind { ; X86-NOX87-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NOX87-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NOX87-NEXT: lock cmpxchgl %ecx, sc32 -; X86-NOX87-NEXT: sete %cl -; X86-NOX87-NEXT: testb $1, %cl -; X86-NOX87-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOX87-NEXT: movl %eax, %ecx +; X86-NOX87-NEXT: sete %al +; X86-NOX87-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOX87-NEXT: testb $1, %al ; X86-NOX87-NEXT: jne .LBB7_2 ; X86-NOX87-NEXT: jmp .LBB7_1 ; X86-NOX87-NEXT: .LBB7_2: # %atomicrmw.end @@ -511,9 +521,10 @@ define void @atomic_fetch_umax32(i32 %x) nounwind { ; X64-NEXT: subl %ecx, %edx ; X64-NEXT: cmoval %eax, %ecx ; X64-NEXT: lock cmpxchgl %ecx, sc32(%rip) -; X64-NEXT: sete %cl -; X64-NEXT: testb $1, %cl -; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: movl %eax, %ecx +; X64-NEXT: sete %al +; X64-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: testb $1, %al ; X64-NEXT: jne .LBB8_2 ; X64-NEXT: jmp .LBB8_1 ; X64-NEXT: .LBB8_2: # %atomicrmw.end @@ -534,9 +545,10 @@ define void @atomic_fetch_umax32(i32 %x) nounwind { ; X86-CMOV-NEXT: subl %ecx, %edx ; X86-CMOV-NEXT: cmoval %eax, %ecx ; X86-CMOV-NEXT: lock cmpxchgl %ecx, sc32 -; X86-CMOV-NEXT: sete %cl -; X86-CMOV-NEXT: testb $1, %cl -; X86-CMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-CMOV-NEXT: movl %eax, %ecx +; X86-CMOV-NEXT: sete %al +; X86-CMOV-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-CMOV-NEXT: testb $1, %al ; X86-CMOV-NEXT: jne .LBB8_2 ; X86-CMOV-NEXT: jmp .LBB8_1 ; X86-CMOV-NEXT: .LBB8_2: # %atomicrmw.end @@ -568,9 +580,10 @@ define void @atomic_fetch_umax32(i32 %x) nounwind { ; X86-NOCMOV-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NOCMOV-NEXT: lock cmpxchgl %ecx, sc32 -; X86-NOCMOV-NEXT: sete %cl -; X86-NOCMOV-NEXT: testb $1, %cl 
-; X86-NOCMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOCMOV-NEXT: movl %eax, %ecx +; X86-NOCMOV-NEXT: sete %al +; X86-NOCMOV-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOCMOV-NEXT: testb $1, %al ; X86-NOCMOV-NEXT: jne .LBB8_2 ; X86-NOCMOV-NEXT: jmp .LBB8_1 ; X86-NOCMOV-NEXT: .LBB8_2: # %atomicrmw.end @@ -602,9 +615,10 @@ define void @atomic_fetch_umax32(i32 %x) nounwind { ; X86-NOX87-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NOX87-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NOX87-NEXT: lock cmpxchgl %ecx, sc32 -; X86-NOX87-NEXT: sete %cl -; X86-NOX87-NEXT: testb $1, %cl -; X86-NOX87-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOX87-NEXT: movl %eax, %ecx +; X86-NOX87-NEXT: sete %al +; X86-NOX87-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOX87-NEXT: testb $1, %al ; X86-NOX87-NEXT: jne .LBB8_2 ; X86-NOX87-NEXT: jmp .LBB8_1 ; X86-NOX87-NEXT: .LBB8_2: # %atomicrmw.end @@ -628,9 +642,10 @@ define void @atomic_fetch_umin32(i32 %x) nounwind { ; X64-NEXT: subl %ecx, %edx ; X64-NEXT: cmovbel %eax, %ecx ; X64-NEXT: lock cmpxchgl %ecx, sc32(%rip) -; X64-NEXT: sete %cl -; X64-NEXT: testb $1, %cl -; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: movl %eax, %ecx +; X64-NEXT: sete %al +; X64-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: testb $1, %al ; X64-NEXT: jne .LBB9_2 ; X64-NEXT: jmp .LBB9_1 ; X64-NEXT: .LBB9_2: # %atomicrmw.end @@ -651,9 +666,10 @@ define void @atomic_fetch_umin32(i32 %x) nounwind { ; X86-CMOV-NEXT: subl %ecx, %edx ; X86-CMOV-NEXT: cmovbel %eax, %ecx ; X86-CMOV-NEXT: lock cmpxchgl %ecx, sc32 -; X86-CMOV-NEXT: sete %cl -; X86-CMOV-NEXT: testb $1, %cl -; X86-CMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-CMOV-NEXT: movl %eax, %ecx +; X86-CMOV-NEXT: sete %al +; X86-CMOV-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-CMOV-NEXT: testb $1, %al ; X86-CMOV-NEXT: jne 
.LBB9_2 ; X86-CMOV-NEXT: jmp .LBB9_1 ; X86-CMOV-NEXT: .LBB9_2: # %atomicrmw.end @@ -685,9 +701,10 @@ define void @atomic_fetch_umin32(i32 %x) nounwind { ; X86-NOCMOV-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NOCMOV-NEXT: lock cmpxchgl %ecx, sc32 -; X86-NOCMOV-NEXT: sete %cl -; X86-NOCMOV-NEXT: testb $1, %cl -; X86-NOCMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOCMOV-NEXT: movl %eax, %ecx +; X86-NOCMOV-NEXT: sete %al +; X86-NOCMOV-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOCMOV-NEXT: testb $1, %al ; X86-NOCMOV-NEXT: jne .LBB9_2 ; X86-NOCMOV-NEXT: jmp .LBB9_1 ; X86-NOCMOV-NEXT: .LBB9_2: # %atomicrmw.end @@ -719,9 +736,10 @@ define void @atomic_fetch_umin32(i32 %x) nounwind { ; X86-NOX87-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NOX87-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NOX87-NEXT: lock cmpxchgl %ecx, sc32 -; X86-NOX87-NEXT: sete %cl -; X86-NOX87-NEXT: testb $1, %cl -; X86-NOX87-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOX87-NEXT: movl %eax, %ecx +; X86-NOX87-NEXT: sete %al +; X86-NOX87-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOX87-NEXT: testb $1, %al ; X86-NOX87-NEXT: jne .LBB9_2 ; X86-NOX87-NEXT: jmp .LBB9_1 ; X86-NOX87-NEXT: .LBB9_2: # %atomicrmw.end diff --git a/llvm/test/CodeGen/X86/atomic64.ll b/llvm/test/CodeGen/X86/atomic64.ll index 8f4da356e06cb..6fc72bcf67ec5 100644 --- a/llvm/test/CodeGen/X86/atomic64.ll +++ b/llvm/test/CodeGen/X86/atomic64.ll @@ -275,9 +275,10 @@ define void @atomic_fetch_nand64(i64 %x) nounwind { ; X64-NEXT: andq %rdx, %rcx ; X64-NEXT: notq %rcx ; X64-NEXT: lock cmpxchgq %rcx, sc64(%rip) -; X64-NEXT: sete %cl -; X64-NEXT: testb $1, %cl -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: sete %al +; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: testb $1, %al ; 
X64-NEXT: jne .LBB5_2 ; X64-NEXT: jmp .LBB5_1 ; X64-NEXT: .LBB5_2: # %atomicrmw.end @@ -314,9 +315,10 @@ define void @atomic_fetch_max64(i64 %x) nounwind { ; X64-NEXT: subq %rcx, %rdx ; X64-NEXT: cmovgq %rax, %rcx ; X64-NEXT: lock cmpxchgq %rcx, sc64(%rip) -; X64-NEXT: sete %cl -; X64-NEXT: testb $1, %cl -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: sete %al +; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: testb $1, %al ; X64-NEXT: jne .LBB6_2 ; X64-NEXT: jmp .LBB6_1 ; X64-NEXT: .LBB6_2: # %atomicrmw.end @@ -406,9 +408,10 @@ define void @atomic_fetch_min64(i64 %x) nounwind { ; X64-NEXT: subq %rcx, %rdx ; X64-NEXT: cmovleq %rax, %rcx ; X64-NEXT: lock cmpxchgq %rcx, sc64(%rip) -; X64-NEXT: sete %cl -; X64-NEXT: testb $1, %cl -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: sete %al +; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: testb $1, %al ; X64-NEXT: jne .LBB7_2 ; X64-NEXT: jmp .LBB7_1 ; X64-NEXT: .LBB7_2: # %atomicrmw.end @@ -498,9 +501,10 @@ define void @atomic_fetch_umax64(i64 %x) nounwind { ; X64-NEXT: subq %rcx, %rdx ; X64-NEXT: cmovaq %rax, %rcx ; X64-NEXT: lock cmpxchgq %rcx, sc64(%rip) -; X64-NEXT: sete %cl -; X64-NEXT: testb $1, %cl -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: sete %al +; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: testb $1, %al ; X64-NEXT: jne .LBB8_2 ; X64-NEXT: jmp .LBB8_1 ; X64-NEXT: .LBB8_2: # %atomicrmw.end @@ -590,9 +594,10 @@ define void @atomic_fetch_umin64(i64 %x) nounwind { ; X64-NEXT: subq %rcx, %rdx ; X64-NEXT: cmovbeq %rax, %rcx ; X64-NEXT: lock cmpxchgq %rcx, sc64(%rip) -; X64-NEXT: sete %cl -; X64-NEXT: testb $1, %cl -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: sete %al +; X64-NEXT: movq %rcx, 
{{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: testb $1, %al ; X64-NEXT: jne .LBB9_2 ; X64-NEXT: jmp .LBB9_1 ; X64-NEXT: .LBB9_2: # %atomicrmw.end diff --git a/llvm/test/CodeGen/X86/atomic6432.ll b/llvm/test/CodeGen/X86/atomic6432.ll index 8ff5f338e1482..3d0617370a1bb 100644 --- a/llvm/test/CodeGen/X86/atomic6432.ll +++ b/llvm/test/CodeGen/X86/atomic6432.ll @@ -16,15 +16,15 @@ define void @atomic_fetch_add64() nounwind { ; X32-NEXT: jmp .LBB0_1 ; X32-NEXT: .LBB0_1: # %atomicrmw.start14 ; X32-NEXT: # =>This Inner Loop Header: Depth=1 -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X32-NEXT: movl %eax, %ebx ; X32-NEXT: addl $1, %ebx ; X32-NEXT: movl %edx, %ecx ; X32-NEXT: adcl $0, %ecx ; X32-NEXT: lock cmpxchg8b sc64 -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: jne .LBB0_1 ; X32-NEXT: jmp .LBB0_2 ; X32-NEXT: .LBB0_2: # %atomicrmw.end13 @@ -35,15 +35,15 @@ define void @atomic_fetch_add64() nounwind { ; X32-NEXT: jmp .LBB0_3 ; X32-NEXT: .LBB0_3: # %atomicrmw.start8 ; X32-NEXT: # =>This Inner Loop Header: Depth=1 -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X32-NEXT: movl %eax, %ebx ; X32-NEXT: addl $3, %ebx ; X32-NEXT: movl %edx, %ecx ; X32-NEXT: adcl $0, %ecx ; X32-NEXT: lock cmpxchg8b sc64 -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: jne .LBB0_3 ; X32-NEXT: jmp .LBB0_4 ; X32-NEXT: .LBB0_4: # %atomicrmw.end7 @@ -75,8 +75,8 @@ define void @atomic_fetch_add64() 
nounwind { ; X32-NEXT: jmp .LBB0_7 ; X32-NEXT: .LBB0_7: # %atomicrmw.start ; X32-NEXT: # =>This Inner Loop Header: Depth=1 -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X32-NEXT: movl (%esp), %eax # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: movl %eax, %ebx @@ -84,8 +84,8 @@ define void @atomic_fetch_add64() nounwind { ; X32-NEXT: movl %edx, %ecx ; X32-NEXT: adcl %esi, %ecx ; X32-NEXT: lock cmpxchg8b sc64 -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill ; X32-NEXT: jne .LBB0_7 ; X32-NEXT: jmp .LBB0_8 ; X32-NEXT: .LBB0_8: # %atomicrmw.end @@ -114,15 +114,15 @@ define void @atomic_fetch_sub64() nounwind { ; X32-NEXT: jmp .LBB1_1 ; X32-NEXT: .LBB1_1: # %atomicrmw.start14 ; X32-NEXT: # =>This Inner Loop Header: Depth=1 -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X32-NEXT: movl %eax, %ebx ; X32-NEXT: addl $-1, %ebx ; X32-NEXT: movl %edx, %ecx ; X32-NEXT: adcl $-1, %ecx ; X32-NEXT: lock cmpxchg8b sc64 -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: jne .LBB1_1 ; X32-NEXT: jmp .LBB1_2 ; X32-NEXT: .LBB1_2: # %atomicrmw.end13 @@ -133,15 +133,15 @@ define void @atomic_fetch_sub64() nounwind { ; X32-NEXT: jmp .LBB1_3 ; X32-NEXT: .LBB1_3: # %atomicrmw.start8 ; X32-NEXT: # =>This Inner Loop Header: Depth=1 -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), 
%edx # 4-byte Reload ; X32-NEXT: movl %eax, %ebx ; X32-NEXT: addl $-3, %ebx ; X32-NEXT: movl %edx, %ecx ; X32-NEXT: adcl $-1, %ecx ; X32-NEXT: lock cmpxchg8b sc64 -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: jne .LBB1_3 ; X32-NEXT: jmp .LBB1_4 ; X32-NEXT: .LBB1_4: # %atomicrmw.end7 @@ -173,8 +173,8 @@ define void @atomic_fetch_sub64() nounwind { ; X32-NEXT: jmp .LBB1_7 ; X32-NEXT: .LBB1_7: # %atomicrmw.start ; X32-NEXT: # =>This Inner Loop Header: Depth=1 -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X32-NEXT: movl (%esp), %eax # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: movl %eax, %ebx @@ -182,8 +182,8 @@ define void @atomic_fetch_sub64() nounwind { ; X32-NEXT: movl %edx, %ecx ; X32-NEXT: sbbl %esi, %ecx ; X32-NEXT: lock cmpxchg8b sc64 -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill ; X32-NEXT: jne .LBB1_7 ; X32-NEXT: jmp .LBB1_8 ; X32-NEXT: .LBB1_8: # %atomicrmw.end @@ -211,14 +211,14 @@ define void @atomic_fetch_and64() nounwind { ; X32-NEXT: jmp .LBB2_1 ; X32-NEXT: .LBB2_1: # %atomicrmw.start8 ; X32-NEXT: # =>This Inner Loop Header: Depth=1 -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X32-NEXT: movl %eax, %ebx ; X32-NEXT: andl $3, %ebx ; X32-NEXT: xorl %ecx, %ecx ; X32-NEXT: lock cmpxchg8b sc64 -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: jne .LBB2_1 ; X32-NEXT: jmp .LBB2_2 ; X32-NEXT: .LBB2_2: # %atomicrmw.end7 @@ -250,8 +250,8 @@ define void @atomic_fetch_and64() nounwind { ; X32-NEXT: jmp .LBB2_5 ; X32-NEXT: .LBB2_5: # %atomicrmw.start ; X32-NEXT: # =>This Inner Loop Header: Depth=1 -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X32-NEXT: movl (%esp), %eax # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: movl %eax, %ebx @@ -259,8 +259,8 @@ define void @atomic_fetch_and64() nounwind { ; X32-NEXT: movl %edx, %ecx ; X32-NEXT: andl %esi, %ecx ; X32-NEXT: lock cmpxchg8b sc64 -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill ; X32-NEXT: jne .LBB2_5 ; X32-NEXT: jmp .LBB2_6 ; X32-NEXT: .LBB2_6: # %atomicrmw.end @@ -287,14 +287,14 @@ define void @atomic_fetch_or64() nounwind { ; X32-NEXT: jmp .LBB3_1 ; X32-NEXT: .LBB3_1: # %atomicrmw.start8 ; X32-NEXT: # =>This Inner Loop Header: Depth=1 -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: movl %eax, %ebx ; X32-NEXT: orl $3, %ebx ; X32-NEXT: movl %ecx, %edx ; X32-NEXT: lock cmpxchg8b sc64 -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: jne .LBB3_1 ; X32-NEXT: jmp .LBB3_2 ; X32-NEXT: .LBB3_2: # %atomicrmw.end7 @@ -326,8 +326,8 @@ define void @atomic_fetch_or64() nounwind { ; X32-NEXT: jmp .LBB3_5 ; X32-NEXT: .LBB3_5: # %atomicrmw.start ; X32-NEXT: # =>This Inner Loop Header: Depth=1 -; X32-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X32-NEXT: movl (%esp), %eax # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: movl %eax, %ebx @@ -335,8 +335,8 @@ define void @atomic_fetch_or64() nounwind { ; X32-NEXT: movl %edx, %ecx ; X32-NEXT: orl %esi, %ecx ; X32-NEXT: lock cmpxchg8b sc64 -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill ; X32-NEXT: jne .LBB3_5 ; X32-NEXT: jmp .LBB3_6 ; X32-NEXT: .LBB3_6: # %atomicrmw.end @@ -363,14 +363,14 @@ define void @atomic_fetch_xor64() nounwind { ; X32-NEXT: jmp .LBB4_1 ; X32-NEXT: .LBB4_1: # %atomicrmw.start8 ; X32-NEXT: # =>This Inner Loop Header: Depth=1 -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: movl %eax, %ebx ; X32-NEXT: xorl $3, %ebx ; X32-NEXT: movl %ecx, %edx ; X32-NEXT: lock cmpxchg8b sc64 -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: jne .LBB4_1 ; X32-NEXT: jmp .LBB4_2 ; X32-NEXT: .LBB4_2: # %atomicrmw.end7 @@ -402,8 +402,8 @@ define void @atomic_fetch_xor64() nounwind { ; X32-NEXT: jmp .LBB4_5 ; X32-NEXT: .LBB4_5: # %atomicrmw.start ; X32-NEXT: # =>This Inner Loop Header: Depth=1 -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X32-NEXT: movl (%esp), %eax # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: movl %eax, %ebx @@ -411,8 +411,8 
@@ define void @atomic_fetch_xor64() nounwind { ; X32-NEXT: movl %edx, %ecx ; X32-NEXT: xorl %esi, %ecx ; X32-NEXT: lock cmpxchg8b sc64 -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill ; X32-NEXT: jne .LBB4_5 ; X32-NEXT: jmp .LBB4_6 ; X32-NEXT: .LBB4_6: # %atomicrmw.end @@ -444,8 +444,8 @@ define void @atomic_fetch_nand64(i64 %x) nounwind { ; X32-NEXT: jmp .LBB5_1 ; X32-NEXT: .LBB5_1: # %atomicrmw.start ; X32-NEXT: # =>This Inner Loop Header: Depth=1 -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: movl (%esp), %edi # 4-byte Reload ; X32-NEXT: movl %edx, %ecx @@ -455,8 +455,8 @@ define void @atomic_fetch_nand64(i64 %x) nounwind { ; X32-NEXT: notl %ebx ; X32-NEXT: notl %ecx ; X32-NEXT: lock cmpxchg8b sc64 -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: jne .LBB5_1 ; X32-NEXT: jmp .LBB5_2 ; X32-NEXT: .LBB5_2: # %atomicrmw.end @@ -486,8 +486,8 @@ define void @atomic_fetch_max64(i64 %x) nounwind { ; X32-NEXT: jmp .LBB6_1 ; X32-NEXT: .LBB6_1: # %atomicrmw.start ; X32-NEXT: # =>This Inner Loop Header: Depth=1 -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: movl (%esp), %ecx # 4-byte Reload ; X32-NEXT: movl %ebx, %esi @@ -497,8 +497,8 @@ define void @atomic_fetch_max64(i64 %x) nounwind { ; X32-NEXT: cmovll %edx, %ecx ; X32-NEXT: cmovll %eax, %ebx ; X32-NEXT: lock cmpxchg8b sc64 -; 
X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: jne .LBB6_1 ; X32-NEXT: jmp .LBB6_2 ; X32-NEXT: .LBB6_2: # %atomicrmw.end @@ -527,8 +527,8 @@ define void @atomic_fetch_min64(i64 %x) nounwind { ; X32-NEXT: jmp .LBB7_1 ; X32-NEXT: .LBB7_1: # %atomicrmw.start ; X32-NEXT: # =>This Inner Loop Header: Depth=1 -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: movl (%esp), %ecx # 4-byte Reload ; X32-NEXT: movl %ebx, %esi @@ -538,8 +538,8 @@ define void @atomic_fetch_min64(i64 %x) nounwind { ; X32-NEXT: cmovgel %edx, %ecx ; X32-NEXT: cmovgel %eax, %ebx ; X32-NEXT: lock cmpxchg8b sc64 -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: jne .LBB7_1 ; X32-NEXT: jmp .LBB7_2 ; X32-NEXT: .LBB7_2: # %atomicrmw.end @@ -568,8 +568,8 @@ define void @atomic_fetch_umax64(i64 %x) nounwind { ; X32-NEXT: jmp .LBB8_1 ; X32-NEXT: .LBB8_1: # %atomicrmw.start ; X32-NEXT: # =>This Inner Loop Header: Depth=1 -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: movl (%esp), %ecx # 4-byte Reload ; X32-NEXT: movl %ebx, %esi @@ -579,8 +579,8 @@ define void @atomic_fetch_umax64(i64 %x) nounwind { ; X32-NEXT: cmovbl %edx, %ecx ; X32-NEXT: cmovbl %eax, %ebx ; X32-NEXT: lock cmpxchg8b sc64 -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: jne .LBB8_1 ; X32-NEXT: jmp .LBB8_2 ; X32-NEXT: .LBB8_2: # %atomicrmw.end @@ -609,8 +609,8 @@ define void @atomic_fetch_umin64(i64 %x) nounwind { ; X32-NEXT: jmp .LBB9_1 ; X32-NEXT: .LBB9_1: # %atomicrmw.start ; X32-NEXT: # =>This Inner Loop Header: Depth=1 -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: movl (%esp), %ecx # 4-byte Reload ; X32-NEXT: movl %ebx, %esi @@ -620,8 +620,8 @@ define void @atomic_fetch_umin64(i64 %x) nounwind { ; X32-NEXT: cmovael %edx, %ecx ; X32-NEXT: cmovael %eax, %ebx ; X32-NEXT: lock cmpxchg8b sc64 -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: jne .LBB9_1 ; X32-NEXT: jmp .LBB9_2 ; X32-NEXT: .LBB9_2: # %atomicrmw.end @@ -677,13 +677,13 @@ define void @atomic_fetch_swap64(i64 %x) nounwind { ; X32-NEXT: jmp .LBB12_1 ; X32-NEXT: .LBB12_1: # %atomicrmw.start ; X32-NEXT: # =>This Inner Loop Header: Depth=1 -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: movl (%esp), %ecx # 4-byte Reload ; X32-NEXT: lock cmpxchg8b sc64 -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: jne .LBB12_1 ; X32-NEXT: jmp .LBB12_2 ; X32-NEXT: .LBB12_2: # %atomicrmw.end diff --git 
a/llvm/test/CodeGen/X86/callbr-asm-branch-folding.ll b/llvm/test/CodeGen/X86/callbr-asm-branch-folding.ll index 3d389523dffb3..95faca819e975 100644 --- a/llvm/test/CodeGen/X86/callbr-asm-branch-folding.ll +++ b/llvm/test/CodeGen/X86/callbr-asm-branch-folding.ll @@ -17,7 +17,7 @@ define dso_local void @n(ptr %o, i32 %p, i32 %u) nounwind { ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: movl %edx, %ebp -; CHECK-NEXT: movl %esi, %r12d +; CHECK-NEXT: movl %esi, %r13d ; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: callq c ; CHECK-NEXT: movl %eax, %r14d @@ -28,17 +28,17 @@ define dso_local void @n(ptr %o, i32 %p, i32 %u) nounwind { ; CHECK-NEXT: # %bb.1: # %if.end ; CHECK-NEXT: cmpl $0, e(%rip) ; CHECK-NEXT: # implicit-def: $r15d -; CHECK-NEXT: # implicit-def: $r13d +; CHECK-NEXT: # implicit-def: $r12d ; CHECK-NEXT: je .LBB0_4 ; CHECK-NEXT: # %bb.2: # %if.then4 -; CHECK-NEXT: movslq %r12d, %rdi +; CHECK-NEXT: movslq %r13d, %rdi ; CHECK-NEXT: callq m ; CHECK-NEXT: # implicit-def: $r15d ; CHECK-NEXT: # implicit-def: $r12d ; CHECK-NEXT: .LBB0_3: # %r ; CHECK-NEXT: callq c -; CHECK-NEXT: movl %r12d, %r13d ; CHECK-NEXT: .LBB0_4: # %if.end8 +; CHECK-NEXT: movl %r12d, %r13d ; CHECK-NEXT: movl %r15d, %edi ; CHECK-NEXT: callq i ; CHECK-NEXT: movl %eax, %r12d diff --git a/llvm/test/CodeGen/X86/callbr-asm-kill.mir b/llvm/test/CodeGen/X86/callbr-asm-kill.mir index 5aabeade52da1..58a9e4de77faf 100644 --- a/llvm/test/CodeGen/X86/callbr-asm-kill.mir +++ b/llvm/test/CodeGen/X86/callbr-asm-kill.mir @@ -6,14 +6,12 @@ # subsequent use of [[MOV64rm]] in the INLINEASM_BR instruction which should be # killed instead. 
--- | - ; ModuleID = '' source_filename = "" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" declare void @foo(ptr) - ; Function Attrs: nounwind define void @test1(ptr %arg, ptr %mem) #0 { entry: br label %loop @@ -57,18 +55,17 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY killed $rsi ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY killed $rdi - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64 = COPY killed [[COPY1]] ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.loop (ir-block-address-taken %ir-block.loop, inlineasm-br-indirect-target): ; CHECK-NEXT: successors: %bb.2(0x80000000), %bb.1(0x00000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr64 = COPY killed [[COPY2]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64 = COPY killed [[COPY1]] ; CHECK-NEXT: [[MOV64rm:%[0-9]+]]:gr64 = MOV64rm [[COPY]], 1, $noreg, 0, $noreg :: (load (s64) from %ir.mem) ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - ; CHECK-NEXT: $rdi = COPY killed [[COPY3]] + ; CHECK-NEXT: $rdi = COPY killed [[COPY2]] ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @foo, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gr64 = COPY [[MOV64rm]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY [[MOV64rm]] ; CHECK-NEXT: INLINEASM_BR &"", 9 /* sideeffect mayload attdialect */, 262190 /* mem:m */, killed [[MOV64rm]], 1, $noreg, 0, $noreg, 13 /* imm */, blockaddress(@test1, %ir-block.loop) ; CHECK-NEXT: JMP_1 %bb.2 ; CHECK-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/X86/coalescer-breaks-subreg-to-reg-liveness-reduced.ll b/llvm/test/CodeGen/X86/coalescer-breaks-subreg-to-reg-liveness-reduced.ll 
index e9f529eea7d3f..fe8d131a977ea 100644 --- a/llvm/test/CodeGen/X86/coalescer-breaks-subreg-to-reg-liveness-reduced.ll +++ b/llvm/test/CodeGen/X86/coalescer-breaks-subreg-to-reg-liveness-reduced.ll @@ -49,8 +49,8 @@ define void @foo(ptr %arg3, i1 %icmp16) #0 { ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_1: # %bb5 ; CHECK-NEXT: # in Loop: Header=BB0_2 Depth=1 -; CHECK-NEXT: orl $1, %r12d ; CHECK-NEXT: movq %r14, %r15 +; CHECK-NEXT: orl $1, %r12d ; CHECK-NEXT: .LBB0_2: # %bb7 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: xorl %eax, %eax diff --git a/llvm/test/CodeGen/X86/combine-pmuldq.ll b/llvm/test/CodeGen/X86/combine-pmuldq.ll index 70335f834291d..845c6cd6f2454 100644 --- a/llvm/test/CodeGen/X86/combine-pmuldq.ll +++ b/llvm/test/CodeGen/X86/combine-pmuldq.ll @@ -328,9 +328,9 @@ define <8 x i32> @PR49658_zext(ptr %ptr, i32 %mul) { ; SSE: # %bb.0: # %start ; SSE-NEXT: movd %esi, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] -; SSE-NEXT: pxor %xmm0, %xmm0 -; SSE-NEXT: movq $-2097152, %rax # imm = 0xFFE00000 ; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: movq $-2097152, %rax # imm = 0xFFE00000 +; SSE-NEXT: pxor %xmm0, %xmm0 ; SSE-NEXT: .p2align 4 ; SSE-NEXT: .LBB7_1: # %loop ; SSE-NEXT: # =>This Inner Loop Header: Depth=1 @@ -469,11 +469,11 @@ define <8 x i32> @PR49658_sext(ptr %ptr, i32 %mul) { ; SSE-NEXT: movslq %esi, %rax ; SSE-NEXT: movq %rax, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] -; SSE-NEXT: pxor %xmm0, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 ; SSE-NEXT: movq $-2097152, %rax # imm = 0xFFE00000 ; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: psrlq $32, %xmm3 -; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: pxor %xmm0, %xmm0 ; SSE-NEXT: .p2align 4 ; SSE-NEXT: .LBB8_1: # %loop ; SSE-NEXT: # =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/X86/fp128-select.ll b/llvm/test/CodeGen/X86/fp128-select.ll index 659e4ddedc646..e6bf539cc4a12 100644 --- a/llvm/test/CodeGen/X86/fp128-select.ll +++ 
b/llvm/test/CodeGen/X86/fp128-select.ll @@ -14,7 +14,7 @@ define void @test_select(ptr %p, ptr %q, i1 zeroext %c) nounwind { ; SSE-NEXT: testl %edx, %edx ; SSE-NEXT: jne .LBB0_1 ; SSE-NEXT: # %bb.3: -; SSE-NEXT: movaps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: movaps {{.*#+}} xmm0 = [NaN] ; SSE-NEXT: movaps %xmm0, (%rsi) ; SSE-NEXT: retq ; SSE-NEXT: .LBB0_1: @@ -58,7 +58,7 @@ define fp128 @test_select_cc(fp128, fp128) nounwind { ; SSE-NEXT: xorps %xmm1, %xmm1 ; SSE-NEXT: jmp .LBB1_3 ; SSE-NEXT: .LBB1_1: -; SSE-NEXT: movaps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.0E+0] ; SSE-NEXT: .LBB1_3: # %BB0 ; SSE-NEXT: testl %ebx, %ebx ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload @@ -88,17 +88,18 @@ define fp128 @test_select_cc(fp128, fp128) nounwind { ; NOSSE-NEXT: movq %r12, %rdx ; NOSSE-NEXT: movq %r15, %rcx ; NOSSE-NEXT: callq __eqtf2@PLT -; NOSSE-NEXT: movl %eax, %ecx -; NOSSE-NEXT: xorl %eax, %eax -; NOSSE-NEXT: testl %ecx, %ecx -; NOSSE-NEXT: movabsq $4611404543450677248, %rdx # imm = 0x3FFF000000000000 -; NOSSE-NEXT: cmovneq %rax, %rdx +; NOSSE-NEXT: xorl %ecx, %ecx +; NOSSE-NEXT: testl %eax, %eax +; NOSSE-NEXT: movabsq $4611404543450677248, %rax # imm = 0x3FFF000000000000 +; NOSSE-NEXT: cmovneq %rcx, %rax ; NOSSE-NEXT: testl %ebp, %ebp -; NOSSE-NEXT: je .LBB1_2 -; NOSSE-NEXT: # %bb.1: +; NOSSE-NEXT: jne .LBB1_2 +; NOSSE-NEXT: # %bb.1: # %BB1 +; NOSSE-NEXT: xorl %r14d, %r14d +; NOSSE-NEXT: movq %rax, %rbx +; NOSSE-NEXT: .LBB1_2: # %BB2 ; NOSSE-NEXT: movq %r14, %rax ; NOSSE-NEXT: movq %rbx, %rdx -; NOSSE-NEXT: .LBB1_2: # %BB2 ; NOSSE-NEXT: popq %rbx ; NOSSE-NEXT: popq %r12 ; NOSSE-NEXT: popq %r14 diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll index bdb7c307a5759..6ca9e77b9a555 100644 --- a/llvm/test/CodeGen/X86/madd.ll +++ b/llvm/test/CodeGen/X86/madd.ll @@ -196,10 +196,10 @@ define i32 @_Z10test_shortPsS_i_512(ptr nocapture readonly, ptr nocapture readon ; SSE2-LABEL: 
_Z10test_shortPsS_i_512: ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: xorl %ecx, %ecx ; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: .p2align 4 ; SSE2-NEXT: .LBB2_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 @@ -210,19 +210,19 @@ define i32 @_Z10test_shortPsS_i_512(ptr nocapture readonly, ptr nocapture readon ; SSE2-NEXT: paddd %xmm5, %xmm2 ; SSE2-NEXT: movdqu 16(%rsi,%rcx,2), %xmm3 ; SSE2-NEXT: pmaddwd %xmm4, %xmm3 -; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: paddd %xmm3, %xmm0 ; SSE2-NEXT: addq $16, %rcx ; SSE2-NEXT: cmpq %rcx, %rax ; SSE2-NEXT: jne .LBB2_1 ; SSE2-NEXT: # %bb.2: # %middle.block -; SSE2-NEXT: paddd %xmm0, %xmm2 -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: paddd %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE2-NEXT: paddd %xmm1, %xmm2 ; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE2-NEXT: paddd %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: retq ; ; AVX1-LABEL: _Z10test_shortPsS_i_512: @@ -397,9 +397,9 @@ define i32 @_Z10test_shortPsS_i_1024(ptr nocapture readonly, ptr nocapture reado ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: movl %edx, %eax ; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: xorl %ecx, %ecx ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: .p2align 4 ; AVX1-NEXT: .LBB3_1: # %vector.body ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 @@ -445,8 +445,8 @@ define i32 @_Z10test_shortPsS_i_1024(ptr nocapture readonly, ptr nocapture reado ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: movl %edx, %eax ; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: xorl 
%ecx, %ecx ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: xorl %ecx, %ecx ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: .p2align 4 ; AVX2-NEXT: .LBB3_1: # %vector.body @@ -454,16 +454,16 @@ define i32 @_Z10test_shortPsS_i_1024(ptr nocapture readonly, ptr nocapture reado ; AVX2-NEXT: vmovdqu (%rsi,%rcx,2), %ymm3 ; AVX2-NEXT: vmovdqu 32(%rsi,%rcx,2), %ymm4 ; AVX2-NEXT: vpmaddwd (%rdi,%rcx,2), %ymm3, %ymm3 -; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1 -; AVX2-NEXT: vpmaddwd 32(%rdi,%rcx,2), %ymm4, %ymm3 ; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpmaddwd 32(%rdi,%rcx,2), %ymm4, %ymm3 +; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: addq $16, %rcx ; AVX2-NEXT: cmpq %rcx, %rax ; AVX2-NEXT: jne .LBB3_1 ; AVX2-NEXT: # %bb.2: # %middle.block -; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm1 -; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm2 ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -770,10 +770,10 @@ define i32 @_Z9test_charPcS_i_512(ptr nocapture readonly, ptr nocapture readonly ; SSE2-LABEL: _Z9test_charPcS_i_512: ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: xorl %ecx, %ecx ; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: .p2align 4 ; SSE2-NEXT: .LBB6_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 @@ -790,19 +790,19 @@ define i32 @_Z9test_charPcS_i_512(ptr nocapture readonly, ptr nocapture readonly ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: psraw $8, %xmm4 ; SSE2-NEXT: pmaddwd %xmm3, %xmm4 -; SSE2-NEXT: paddd %xmm4, %xmm1 +; SSE2-NEXT: paddd %xmm4, %xmm0 ; SSE2-NEXT: addq $16, %rcx ; SSE2-NEXT: cmpq %rcx, %rax ; SSE2-NEXT: jne .LBB6_1 ; SSE2-NEXT: # %bb.2: # 
%middle.block -; SSE2-NEXT: paddd %xmm0, %xmm2 -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: paddd %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE2-NEXT: paddd %xmm1, %xmm2 ; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE2-NEXT: paddd %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: retq ; ; AVX1-LABEL: _Z9test_charPcS_i_512: @@ -993,9 +993,9 @@ define i32 @_Z9test_charPcS_i_1024(ptr nocapture readonly, ptr nocapture readonl ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: movl %edx, %eax ; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: xorl %ecx, %ecx ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: .p2align 4 ; AVX1-NEXT: .LBB7_1: # %vector.body ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 @@ -1045,8 +1045,8 @@ define i32 @_Z9test_charPcS_i_1024(ptr nocapture readonly, ptr nocapture readonl ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: movl %edx, %eax ; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: xorl %ecx, %ecx ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: xorl %ecx, %ecx ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: .p2align 4 ; AVX2-NEXT: .LBB7_1: # %vector.body @@ -1055,17 +1055,17 @@ define i32 @_Z9test_charPcS_i_1024(ptr nocapture readonly, ptr nocapture readonl ; AVX2-NEXT: vpmovsxbw (%rdi,%rcx), %ymm4 ; AVX2-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm5 ; AVX2-NEXT: vpmaddwd %ymm3, %ymm5, %ymm3 -; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3 ; AVX2-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: addq $32, %rcx ; AVX2-NEXT: cmpq %rcx, %rax ; AVX2-NEXT: 
jne .LBB7_1 ; AVX2-NEXT: # %bb.2: # %middle.block -; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm1 -; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm2 ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -1271,19 +1271,19 @@ define i32 @test_unsigned_short_256(ptr nocapture readonly, ptr nocapture readon ; SSE2-NEXT: pmullw %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSE2-NEXT: paddd %xmm2, %xmm0 +; SSE2-NEXT: paddd %xmm2, %xmm1 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: paddd %xmm3, %xmm0 ; SSE2-NEXT: addq $16, %rcx ; SSE2-NEXT: cmpq %rcx, %rax ; SSE2-NEXT: jne .LBB9_1 ; SSE2-NEXT: # %bb.2: # %middle.block -; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: retq ; ; AVX1-LABEL: test_unsigned_short_256: @@ -1442,22 +1442,22 @@ define i32 @test_unsigned_short_512(ptr nocapture readonly, ptr nocapture readon ; AVX1-NEXT: vpmulld %xmm4, %xmm6, %xmm4 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; AVX1-NEXT: vpmulld %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 ; AVX1-NEXT: vpaddd %xmm6, %xmm2, %xmm2 -; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, 
%ymm0, %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpaddd %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vpaddd %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpaddd %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: addq $16, %rcx ; AVX1-NEXT: cmpq %rcx, %rax ; AVX1-NEXT: jne .LBB10_1 ; AVX1-NEXT: # %bb.2: # %middle.block -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -1480,15 +1480,15 @@ define i32 @test_unsigned_short_512(ptr nocapture readonly, ptr nocapture readon ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX2-NEXT: vpmulld %ymm2, %ymm4, %ymm2 -; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX2-NEXT: vpmulld %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: addq $16, %rcx ; AVX2-NEXT: cmpq %rcx, %rax ; AVX2-NEXT: jne .LBB10_1 ; AVX2-NEXT: # %bb.2: # %middle.block -; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = 
xmm0[2,3,2,3] @@ -1765,15 +1765,15 @@ define i32 @test_unsigned_short_1024(ptr nocapture readonly, ptr nocapture reado ; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512-NEXT: vpmulld %zmm2, %zmm4, %zmm2 -; AVX512-NEXT: vpaddd %zmm1, %zmm2, %zmm1 +; AVX512-NEXT: vpaddd %zmm0, %zmm2, %zmm0 ; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512-NEXT: vpmulld %zmm3, %zmm2, %zmm2 -; AVX512-NEXT: vpaddd %zmm0, %zmm2, %zmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm2, %zmm1 ; AVX512-NEXT: addq $16, %rcx ; AVX512-NEXT: cmpq %rcx, %rax ; AVX512-NEXT: jne .LBB11_1 ; AVX512-NEXT: # %bb.2: # %middle.block -; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -2739,9 +2739,9 @@ define i64 @sum_and_sum_of_squares(ptr %a, i32 %n) { ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: movl %esi, %eax ; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: .p2align 4 ; SSE2-NEXT: .LBB33_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 @@ -3101,10 +3101,10 @@ define i32 @add_used_by_loop_phi(ptr %a, ptr %b, i64 %offset_a, i64 %offset_b, i ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: addq %rdx, %rdi 
; SSE2-NEXT: addq %rcx, %rsi +; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: xorl %eax, %eax ; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: .p2align 4 ; SSE2-NEXT: .LBB38_1: # %loop ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 @@ -3121,19 +3121,19 @@ define i32 @add_used_by_loop_phi(ptr %a, ptr %b, i64 %offset_a, i64 %offset_b, i ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: psraw $8, %xmm3 ; SSE2-NEXT: pmaddwd %xmm4, %xmm3 -; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: paddd %xmm3, %xmm0 ; SSE2-NEXT: addq $16, %rax ; SSE2-NEXT: cmpq %r8, %rax ; SSE2-NEXT: jb .LBB38_1 ; SSE2-NEXT: # %bb.2: # %afterloop -; SSE2-NEXT: paddd %xmm0, %xmm2 -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: paddd %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE2-NEXT: paddd %xmm1, %xmm2 ; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE2-NEXT: paddd %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: retq ; ; AVX1-LABEL: add_used_by_loop_phi: diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll index 8c4bab99a5b7b..76a94f825266a 100644 --- a/llvm/test/CodeGen/X86/masked_load.ll +++ b/llvm/test/CodeGen/X86/masked_load.ll @@ -226,12 +226,11 @@ define <4 x double> @load_v4f64_v4i32(<4 x i32> %trigger, ptr %addr, <4 x double define <4 x double> @load_v4f64_v4i32_zero(<4 x i32> %trigger, ptr %addr) { ; SSE-LABEL: load_v4f64_v4i32_zero: ; SSE: ## %bb.0: -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pxor %xmm0, %xmm0 -; SSE-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE-NEXT: movmskps %xmm1, %eax +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE-NEXT: movmskps %xmm0, %eax ; 
SSE-NEXT: testb $1, %al -; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: xorps %xmm0, %xmm0 ; SSE-NEXT: jne LBB3_1 ; SSE-NEXT: ## %bb.2: ## %else ; SSE-NEXT: testb $2, %al @@ -245,7 +244,7 @@ define <4 x double> @load_v4f64_v4i32_zero(<4 x i32> %trigger, ptr %addr) { ; SSE-NEXT: LBB3_8: ## %else8 ; SSE-NEXT: retq ; SSE-NEXT: LBB3_1: ## %cond.load -; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: testb $2, %al ; SSE-NEXT: je LBB3_4 ; SSE-NEXT: LBB3_3: ## %cond.load1 @@ -1096,9 +1095,9 @@ define <8 x float> @load_v8f32_v8i1_zero(<8 x i1> %mask, ptr %addr) { ; SSE2-NEXT: psllw $15, %xmm0 ; SSE2-NEXT: packsswb %xmm0, %xmm0 ; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: jne LBB10_1 ; SSE2-NEXT: ## %bb.2: ## %else ; SSE2-NEXT: testb $2, %al @@ -1175,9 +1174,9 @@ define <8 x float> @load_v8f32_v8i1_zero(<8 x i1> %mask, ptr %addr) { ; SSE42-NEXT: psllw $15, %xmm0 ; SSE42-NEXT: packsswb %xmm0, %xmm0 ; SSE42-NEXT: pmovmskb %xmm0, %eax -; SSE42-NEXT: pxor %xmm0, %xmm0 -; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: xorps %xmm1, %xmm1 +; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: pxor %xmm0, %xmm0 ; SSE42-NEXT: jne LBB10_1 ; SSE42-NEXT: ## %bb.2: ## %else ; SSE42-NEXT: testb $2, %al @@ -2614,9 +2613,9 @@ define <8 x i32> @load_v8i32_v8i1_zero(<8 x i1> %mask, ptr %addr) { ; SSE2-NEXT: psllw $15, %xmm0 ; SSE2-NEXT: packsswb %xmm0, %xmm0 ; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: jne LBB20_1 ; SSE2-NEXT: ## %bb.2: ## %else ; SSE2-NEXT: testb $2, %al @@ -2693,9 +2692,9 @@ define <8 x i32> @load_v8i32_v8i1_zero(<8 x i1> %mask, ptr %addr) { ; SSE42-NEXT: psllw $15, %xmm0 ; SSE42-NEXT: packsswb %xmm0, %xmm0 ; SSE42-NEXT: pmovmskb %xmm0, 
%eax -; SSE42-NEXT: pxor %xmm0, %xmm0 -; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: pxor %xmm0, %xmm0 ; SSE42-NEXT: jne LBB20_1 ; SSE42-NEXT: ## %bb.2: ## %else ; SSE42-NEXT: testb $2, %al diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll index d752659f94a50..0ebe7d4ea588d 100644 --- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll +++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll @@ -174,8 +174,8 @@ define dso_local i32 @_Z9test_charPcS_i_256(ptr nocapture readonly, ptr nocaptur ; CHECK-SKX: # %bb.0: # %entry ; CHECK-SKX-NEXT: movl %edx, %eax ; CHECK-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-SKX-NEXT: xorl %ecx, %ecx ; CHECK-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-SKX-NEXT: xorl %ecx, %ecx ; CHECK-SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-SKX-NEXT: .p2align 4 ; CHECK-SKX-NEXT: .LBB8_1: # %vector.body @@ -184,17 +184,17 @@ define dso_local i32 @_Z9test_charPcS_i_256(ptr nocapture readonly, ptr nocaptur ; CHECK-SKX-NEXT: vpmovsxbw (%rdi,%rcx), %ymm4 ; CHECK-SKX-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm5 ; CHECK-SKX-NEXT: vpmaddwd %ymm3, %ymm5, %ymm3 -; CHECK-SKX-NEXT: vpaddd %ymm2, %ymm3, %ymm2 +; CHECK-SKX-NEXT: vpaddd %ymm1, %ymm3, %ymm1 ; CHECK-SKX-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3 ; CHECK-SKX-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3 -; CHECK-SKX-NEXT: vpaddd %ymm1, %ymm3, %ymm1 +; CHECK-SKX-NEXT: vpaddd %ymm2, %ymm3, %ymm2 ; CHECK-SKX-NEXT: addq $32, %rcx ; CHECK-SKX-NEXT: cmpq %rcx, %rax ; CHECK-SKX-NEXT: jne .LBB8_1 ; CHECK-SKX-NEXT: # %bb.2: # %middle.block -; CHECK-SKX-NEXT: vpaddd %ymm0, %ymm1, %ymm1 -; CHECK-SKX-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; CHECK-SKX-NEXT: vpaddd %ymm0, %ymm2, %ymm2 ; CHECK-SKX-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; CHECK-SKX-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ; CHECK-SKX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -209,8 
+209,8 @@ define dso_local i32 @_Z9test_charPcS_i_256(ptr nocapture readonly, ptr nocaptur ; CHECK-AVX512: # %bb.0: # %entry ; CHECK-AVX512-NEXT: movl %edx, %eax ; CHECK-AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-AVX512-NEXT: xorl %ecx, %ecx ; CHECK-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512-NEXT: xorl %ecx, %ecx ; CHECK-AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-AVX512-NEXT: .p2align 4 ; CHECK-AVX512-NEXT: .LBB8_1: # %vector.body @@ -219,17 +219,17 @@ define dso_local i32 @_Z9test_charPcS_i_256(ptr nocapture readonly, ptr nocaptur ; CHECK-AVX512-NEXT: vpmovsxbw (%rdi,%rcx), %ymm4 ; CHECK-AVX512-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm5 ; CHECK-AVX512-NEXT: vpmaddwd %ymm3, %ymm5, %ymm3 -; CHECK-AVX512-NEXT: vpaddd %ymm2, %ymm3, %ymm2 +; CHECK-AVX512-NEXT: vpaddd %ymm1, %ymm3, %ymm1 ; CHECK-AVX512-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3 ; CHECK-AVX512-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3 -; CHECK-AVX512-NEXT: vpaddd %ymm1, %ymm3, %ymm1 +; CHECK-AVX512-NEXT: vpaddd %ymm2, %ymm3, %ymm2 ; CHECK-AVX512-NEXT: addq $32, %rcx ; CHECK-AVX512-NEXT: cmpq %rcx, %rax ; CHECK-AVX512-NEXT: jne .LBB8_1 ; CHECK-AVX512-NEXT: # %bb.2: # %middle.block -; CHECK-AVX512-NEXT: vpaddd %ymm0, %ymm1, %ymm1 -; CHECK-AVX512-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; CHECK-AVX512-NEXT: vpaddd %ymm0, %ymm2, %ymm2 ; CHECK-AVX512-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; CHECK-AVX512-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ; CHECK-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -244,8 +244,8 @@ define dso_local i32 @_Z9test_charPcS_i_256(ptr nocapture readonly, ptr nocaptur ; CHECK-VBMI: # %bb.0: # %entry ; CHECK-VBMI-NEXT: movl %edx, %eax ; CHECK-VBMI-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-VBMI-NEXT: xorl %ecx, %ecx ; CHECK-VBMI-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-VBMI-NEXT: xorl %ecx, %ecx ; CHECK-VBMI-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-VBMI-NEXT: .p2align 4 ; CHECK-VBMI-NEXT: .LBB8_1: # 
%vector.body @@ -254,17 +254,17 @@ define dso_local i32 @_Z9test_charPcS_i_256(ptr nocapture readonly, ptr nocaptur ; CHECK-VBMI-NEXT: vpmovsxbw (%rdi,%rcx), %ymm4 ; CHECK-VBMI-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm5 ; CHECK-VBMI-NEXT: vpmaddwd %ymm3, %ymm5, %ymm3 -; CHECK-VBMI-NEXT: vpaddd %ymm2, %ymm3, %ymm2 +; CHECK-VBMI-NEXT: vpaddd %ymm1, %ymm3, %ymm1 ; CHECK-VBMI-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3 ; CHECK-VBMI-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3 -; CHECK-VBMI-NEXT: vpaddd %ymm1, %ymm3, %ymm1 +; CHECK-VBMI-NEXT: vpaddd %ymm2, %ymm3, %ymm2 ; CHECK-VBMI-NEXT: addq $32, %rcx ; CHECK-VBMI-NEXT: cmpq %rcx, %rax ; CHECK-VBMI-NEXT: jne .LBB8_1 ; CHECK-VBMI-NEXT: # %bb.2: # %middle.block -; CHECK-VBMI-NEXT: vpaddd %ymm0, %ymm1, %ymm1 -; CHECK-VBMI-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; CHECK-VBMI-NEXT: vpaddd %ymm0, %ymm2, %ymm2 ; CHECK-VBMI-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; CHECK-VBMI-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ; CHECK-VBMI-NEXT: vextracti128 $1, %ymm0, %xmm1 ; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] diff --git a/llvm/test/CodeGen/X86/pcsections-atomics.ll b/llvm/test/CodeGen/X86/pcsections-atomics.ll index 69ae1f19f3200..df74b1b7395cc 100644 --- a/llvm/test/CodeGen/X86/pcsections-atomics.ll +++ b/llvm/test/CodeGen/X86/pcsections-atomics.ll @@ -823,11 +823,12 @@ define void @atomic8_nand_monotonic(ptr %a) { ; O0-NEXT: orb $-43, %dl ; O0-NEXT: .Lpcsection19: ; O0-NEXT: lock cmpxchgb %dl, (%rcx) +; O0-NEXT: movb %al, %cl ; O0-NEXT: .Lpcsection20: -; O0-NEXT: sete %cl +; O0-NEXT: sete %al +; O0-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; O0-NEXT: .Lpcsection21: -; O0-NEXT: testb $1, %cl -; O0-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; O0-NEXT: testb $1, %al ; O0-NEXT: .Lpcsection22: ; O0-NEXT: jne .LBB16_2 ; O0-NEXT: jmp .LBB16_1 @@ -1231,11 +1232,12 @@ define void @atomic8_nand_acquire(ptr %a) { ; O0-NEXT: orb $-43, %dl ; O0-NEXT: .Lpcsection32: ; O0-NEXT: lock cmpxchgb %dl, 
(%rcx) +; O0-NEXT: movb %al, %cl ; O0-NEXT: .Lpcsection33: -; O0-NEXT: sete %cl +; O0-NEXT: sete %al +; O0-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; O0-NEXT: .Lpcsection34: -; O0-NEXT: testb $1, %cl -; O0-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; O0-NEXT: testb $1, %al ; O0-NEXT: .Lpcsection35: ; O0-NEXT: jne .LBB23_2 ; O0-NEXT: jmp .LBB23_1 @@ -1639,11 +1641,12 @@ define void @atomic8_nand_release(ptr %a) { ; O0-NEXT: orb $-43, %dl ; O0-NEXT: .Lpcsection45: ; O0-NEXT: lock cmpxchgb %dl, (%rcx) +; O0-NEXT: movb %al, %cl ; O0-NEXT: .Lpcsection46: -; O0-NEXT: sete %cl +; O0-NEXT: sete %al +; O0-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; O0-NEXT: .Lpcsection47: -; O0-NEXT: testb $1, %cl -; O0-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; O0-NEXT: testb $1, %al ; O0-NEXT: .Lpcsection48: ; O0-NEXT: jne .LBB30_2 ; O0-NEXT: jmp .LBB30_1 @@ -2047,11 +2050,12 @@ define void @atomic8_nand_acq_rel(ptr %a) { ; O0-NEXT: orb $-43, %dl ; O0-NEXT: .Lpcsection58: ; O0-NEXT: lock cmpxchgb %dl, (%rcx) +; O0-NEXT: movb %al, %cl ; O0-NEXT: .Lpcsection59: -; O0-NEXT: sete %cl +; O0-NEXT: sete %al +; O0-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; O0-NEXT: .Lpcsection60: -; O0-NEXT: testb $1, %cl -; O0-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; O0-NEXT: testb $1, %al ; O0-NEXT: .Lpcsection61: ; O0-NEXT: jne .LBB37_2 ; O0-NEXT: jmp .LBB37_1 @@ -2455,11 +2459,12 @@ define void @atomic8_nand_seq_cst(ptr %a) { ; O0-NEXT: orb $-43, %dl ; O0-NEXT: .Lpcsection71: ; O0-NEXT: lock cmpxchgb %dl, (%rcx) +; O0-NEXT: movb %al, %cl ; O0-NEXT: .Lpcsection72: -; O0-NEXT: sete %cl +; O0-NEXT: sete %al +; O0-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; O0-NEXT: .Lpcsection73: -; O0-NEXT: testb $1, %cl -; O0-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; O0-NEXT: testb $1, %al ; O0-NEXT: .Lpcsection74: ; O0-NEXT: jne .LBB44_2 ; O0-NEXT: jmp .LBB44_1 @@ -3778,11 +3783,12 @@ define void 
@atomic16_nand_monotonic(ptr %a) { ; O0-NEXT: # kill: def $dx killed $dx killed $edx ; O0-NEXT: .Lpcsection119: ; O0-NEXT: lock cmpxchgw %dx, (%rcx) +; O0-NEXT: movw %ax, %cx ; O0-NEXT: .Lpcsection120: -; O0-NEXT: sete %cl +; O0-NEXT: sete %al +; O0-NEXT: movw %cx, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; O0-NEXT: .Lpcsection121: -; O0-NEXT: testb $1, %cl -; O0-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; O0-NEXT: testb $1, %al ; O0-NEXT: .Lpcsection122: ; O0-NEXT: jne .LBB64_2 ; O0-NEXT: jmp .LBB64_1 @@ -4206,11 +4212,12 @@ define void @atomic16_nand_acquire(ptr %a) { ; O0-NEXT: # kill: def $dx killed $dx killed $edx ; O0-NEXT: .Lpcsection134: ; O0-NEXT: lock cmpxchgw %dx, (%rcx) +; O0-NEXT: movw %ax, %cx ; O0-NEXT: .Lpcsection135: -; O0-NEXT: sete %cl +; O0-NEXT: sete %al +; O0-NEXT: movw %cx, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; O0-NEXT: .Lpcsection136: -; O0-NEXT: testb $1, %cl -; O0-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; O0-NEXT: testb $1, %al ; O0-NEXT: .Lpcsection137: ; O0-NEXT: jne .LBB71_2 ; O0-NEXT: jmp .LBB71_1 @@ -4634,11 +4641,12 @@ define void @atomic16_nand_release(ptr %a) { ; O0-NEXT: # kill: def $dx killed $dx killed $edx ; O0-NEXT: .Lpcsection149: ; O0-NEXT: lock cmpxchgw %dx, (%rcx) +; O0-NEXT: movw %ax, %cx ; O0-NEXT: .Lpcsection150: -; O0-NEXT: sete %cl +; O0-NEXT: sete %al +; O0-NEXT: movw %cx, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; O0-NEXT: .Lpcsection151: -; O0-NEXT: testb $1, %cl -; O0-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; O0-NEXT: testb $1, %al ; O0-NEXT: .Lpcsection152: ; O0-NEXT: jne .LBB78_2 ; O0-NEXT: jmp .LBB78_1 @@ -5062,11 +5070,12 @@ define void @atomic16_nand_acq_rel(ptr %a) { ; O0-NEXT: # kill: def $dx killed $dx killed $edx ; O0-NEXT: .Lpcsection164: ; O0-NEXT: lock cmpxchgw %dx, (%rcx) +; O0-NEXT: movw %ax, %cx ; O0-NEXT: .Lpcsection165: -; O0-NEXT: sete %cl +; O0-NEXT: sete %al +; O0-NEXT: movw %cx, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; O0-NEXT: 
.Lpcsection166: -; O0-NEXT: testb $1, %cl -; O0-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; O0-NEXT: testb $1, %al ; O0-NEXT: .Lpcsection167: ; O0-NEXT: jne .LBB85_2 ; O0-NEXT: jmp .LBB85_1 @@ -5490,11 +5499,12 @@ define void @atomic16_nand_seq_cst(ptr %a) { ; O0-NEXT: # kill: def $dx killed $dx killed $edx ; O0-NEXT: .Lpcsection179: ; O0-NEXT: lock cmpxchgw %dx, (%rcx) +; O0-NEXT: movw %ax, %cx ; O0-NEXT: .Lpcsection180: -; O0-NEXT: sete %cl +; O0-NEXT: sete %al +; O0-NEXT: movw %cx, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; O0-NEXT: .Lpcsection181: -; O0-NEXT: testb $1, %cl -; O0-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; O0-NEXT: testb $1, %al ; O0-NEXT: .Lpcsection182: ; O0-NEXT: jne .LBB92_2 ; O0-NEXT: jmp .LBB92_1 @@ -6765,11 +6775,12 @@ define void @atomic32_nand_monotonic(ptr %a) { ; O0-NEXT: orl $-43, %edx ; O0-NEXT: .Lpcsection225: ; O0-NEXT: lock cmpxchgl %edx, (%rcx) +; O0-NEXT: movl %eax, %ecx ; O0-NEXT: .Lpcsection226: -; O0-NEXT: sete %cl +; O0-NEXT: sete %al +; O0-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; O0-NEXT: .Lpcsection227: -; O0-NEXT: testb $1, %cl -; O0-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; O0-NEXT: testb $1, %al ; O0-NEXT: .Lpcsection228: ; O0-NEXT: jne .LBB112_2 ; O0-NEXT: jmp .LBB112_1 @@ -7173,11 +7184,12 @@ define void @atomic32_nand_acquire(ptr %a) { ; O0-NEXT: orl $-43, %edx ; O0-NEXT: .Lpcsection238: ; O0-NEXT: lock cmpxchgl %edx, (%rcx) +; O0-NEXT: movl %eax, %ecx ; O0-NEXT: .Lpcsection239: -; O0-NEXT: sete %cl +; O0-NEXT: sete %al +; O0-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; O0-NEXT: .Lpcsection240: -; O0-NEXT: testb $1, %cl -; O0-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; O0-NEXT: testb $1, %al ; O0-NEXT: .Lpcsection241: ; O0-NEXT: jne .LBB119_2 ; O0-NEXT: jmp .LBB119_1 @@ -7581,11 +7593,12 @@ define void @atomic32_nand_release(ptr %a) { ; O0-NEXT: orl $-43, %edx ; O0-NEXT: .Lpcsection251: ; O0-NEXT: lock cmpxchgl %edx, 
(%rcx) +; O0-NEXT: movl %eax, %ecx ; O0-NEXT: .Lpcsection252: -; O0-NEXT: sete %cl +; O0-NEXT: sete %al +; O0-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; O0-NEXT: .Lpcsection253: -; O0-NEXT: testb $1, %cl -; O0-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; O0-NEXT: testb $1, %al ; O0-NEXT: .Lpcsection254: ; O0-NEXT: jne .LBB126_2 ; O0-NEXT: jmp .LBB126_1 @@ -7989,11 +8002,12 @@ define void @atomic32_nand_acq_rel(ptr %a) { ; O0-NEXT: orl $-43, %edx ; O0-NEXT: .Lpcsection264: ; O0-NEXT: lock cmpxchgl %edx, (%rcx) +; O0-NEXT: movl %eax, %ecx ; O0-NEXT: .Lpcsection265: -; O0-NEXT: sete %cl +; O0-NEXT: sete %al +; O0-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; O0-NEXT: .Lpcsection266: -; O0-NEXT: testb $1, %cl -; O0-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; O0-NEXT: testb $1, %al ; O0-NEXT: .Lpcsection267: ; O0-NEXT: jne .LBB133_2 ; O0-NEXT: jmp .LBB133_1 @@ -8397,11 +8411,12 @@ define void @atomic32_nand_seq_cst(ptr %a) { ; O0-NEXT: orl $-43, %edx ; O0-NEXT: .Lpcsection277: ; O0-NEXT: lock cmpxchgl %edx, (%rcx) +; O0-NEXT: movl %eax, %ecx ; O0-NEXT: .Lpcsection278: -; O0-NEXT: sete %cl +; O0-NEXT: sete %al +; O0-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; O0-NEXT: .Lpcsection279: -; O0-NEXT: testb $1, %cl -; O0-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; O0-NEXT: testb $1, %al ; O0-NEXT: .Lpcsection280: ; O0-NEXT: jne .LBB140_2 ; O0-NEXT: jmp .LBB140_1 @@ -9813,11 +9828,12 @@ define void @atomic64_nand_monotonic(ptr %a) { ; O0-NEXT: orq $-43, %rdx ; O0-NEXT: .Lpcsection326: ; O0-NEXT: lock cmpxchgq %rdx, (%rcx) +; O0-NEXT: movq %rax, %rcx ; O0-NEXT: .Lpcsection327: -; O0-NEXT: sete %cl +; O0-NEXT: sete %al +; O0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: .Lpcsection328: -; O0-NEXT: testb $1, %cl -; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: testb $1, %al ; O0-NEXT: .Lpcsection329: ; O0-NEXT: jne .LBB162_2 ; O0-NEXT: jmp 
.LBB162_1 @@ -10224,11 +10240,12 @@ define void @atomic64_nand_acquire(ptr %a) { ; O0-NEXT: orq $-43, %rdx ; O0-NEXT: .Lpcsection340: ; O0-NEXT: lock cmpxchgq %rdx, (%rcx) +; O0-NEXT: movq %rax, %rcx ; O0-NEXT: .Lpcsection341: -; O0-NEXT: sete %cl +; O0-NEXT: sete %al +; O0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: .Lpcsection342: -; O0-NEXT: testb $1, %cl -; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: testb $1, %al ; O0-NEXT: .Lpcsection343: ; O0-NEXT: jne .LBB169_2 ; O0-NEXT: jmp .LBB169_1 @@ -10635,11 +10652,12 @@ define void @atomic64_nand_release(ptr %a) { ; O0-NEXT: orq $-43, %rdx ; O0-NEXT: .Lpcsection354: ; O0-NEXT: lock cmpxchgq %rdx, (%rcx) +; O0-NEXT: movq %rax, %rcx ; O0-NEXT: .Lpcsection355: -; O0-NEXT: sete %cl +; O0-NEXT: sete %al +; O0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: .Lpcsection356: -; O0-NEXT: testb $1, %cl -; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: testb $1, %al ; O0-NEXT: .Lpcsection357: ; O0-NEXT: jne .LBB176_2 ; O0-NEXT: jmp .LBB176_1 @@ -11046,11 +11064,12 @@ define void @atomic64_nand_acq_rel(ptr %a) { ; O0-NEXT: orq $-43, %rdx ; O0-NEXT: .Lpcsection368: ; O0-NEXT: lock cmpxchgq %rdx, (%rcx) +; O0-NEXT: movq %rax, %rcx ; O0-NEXT: .Lpcsection369: -; O0-NEXT: sete %cl +; O0-NEXT: sete %al +; O0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: .Lpcsection370: -; O0-NEXT: testb $1, %cl -; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: testb $1, %al ; O0-NEXT: .Lpcsection371: ; O0-NEXT: jne .LBB183_2 ; O0-NEXT: jmp .LBB183_1 @@ -11457,11 +11476,12 @@ define void @atomic64_nand_seq_cst(ptr %a) { ; O0-NEXT: orq $-43, %rdx ; O0-NEXT: .Lpcsection382: ; O0-NEXT: lock cmpxchgq %rdx, (%rcx) +; O0-NEXT: movq %rax, %rcx ; O0-NEXT: .Lpcsection383: -; O0-NEXT: sete %cl +; O0-NEXT: sete %al +; O0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: .Lpcsection384: -; O0-NEXT: testb 
$1, %cl -; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: testb $1, %al ; O0-NEXT: .Lpcsection385: ; O0-NEXT: jne .LBB190_2 ; O0-NEXT: jmp .LBB190_1 @@ -12685,8 +12705,8 @@ define void @atomic128_store_unordered(ptr %a) { ; O0-NEXT: jmp .LBB203_1 ; O0-NEXT: .LBB203_1: # %atomicrmw.start ; O0-NEXT: # =>This Inner Loop Header: Depth=1 -; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; O0-NEXT: .Lpcsection426: ; O0-NEXT: xorl %ecx, %ecx @@ -12696,8 +12716,8 @@ define void @atomic128_store_unordered(ptr %a) { ; O0-NEXT: movl $42, %ebx ; O0-NEXT: .Lpcsection429: ; O0-NEXT: lock cmpxchg16b (%rsi) -; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: .Lpcsection430: ; O0-NEXT: jne .LBB203_1 ; O0-NEXT: jmp .LBB203_2 @@ -12822,8 +12842,8 @@ define void @atomic128_store_monotonic(ptr %a) { ; O0-NEXT: jmp .LBB204_1 ; O0-NEXT: .LBB204_1: # %atomicrmw.start ; O0-NEXT: # =>This Inner Loop Header: Depth=1 -; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; O0-NEXT: .Lpcsection434: ; O0-NEXT: xorl %ecx, %ecx @@ -12833,8 +12853,8 @@ define void @atomic128_store_monotonic(ptr %a) { ; O0-NEXT: movl $42, %ebx ; O0-NEXT: .Lpcsection437: ; O0-NEXT: lock cmpxchg16b (%rsi) -; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: .Lpcsection438: ; O0-NEXT: jne .LBB204_1 ; O0-NEXT: 
jmp .LBB204_2 @@ -12959,8 +12979,8 @@ define void @atomic128_store_release(ptr %a) { ; O0-NEXT: jmp .LBB205_1 ; O0-NEXT: .LBB205_1: # %atomicrmw.start ; O0-NEXT: # =>This Inner Loop Header: Depth=1 -; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; O0-NEXT: .Lpcsection442: ; O0-NEXT: xorl %ecx, %ecx @@ -12970,8 +12990,8 @@ define void @atomic128_store_release(ptr %a) { ; O0-NEXT: movl $42, %ebx ; O0-NEXT: .Lpcsection445: ; O0-NEXT: lock cmpxchg16b (%rsi) -; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: .Lpcsection446: ; O0-NEXT: jne .LBB205_1 ; O0-NEXT: jmp .LBB205_2 @@ -13096,8 +13116,8 @@ define void @atomic128_store_seq_cst(ptr %a) { ; O0-NEXT: jmp .LBB206_1 ; O0-NEXT: .LBB206_1: # %atomicrmw.start ; O0-NEXT: # =>This Inner Loop Header: Depth=1 -; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; O0-NEXT: .Lpcsection450: ; O0-NEXT: xorl %ecx, %ecx @@ -13107,8 +13127,8 @@ define void @atomic128_store_seq_cst(ptr %a) { ; O0-NEXT: movl $42, %ebx ; O0-NEXT: .Lpcsection453: ; O0-NEXT: lock cmpxchg16b (%rsi) -; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: .Lpcsection454: ; O0-NEXT: jne .LBB206_1 ; O0-NEXT: jmp .LBB206_2 @@ -13282,8 +13302,8 @@ define void @atomic128_xchg_monotonic(ptr %a) { ; O0-NEXT: jmp .LBB208_1 ; O0-NEXT: .LBB208_1: # %atomicrmw.start ; O0-NEXT: # 
=>This Inner Loop Header: Depth=1 -; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; O0-NEXT: .Lpcsection459: ; O0-NEXT: xorl %ecx, %ecx @@ -13293,8 +13313,8 @@ define void @atomic128_xchg_monotonic(ptr %a) { ; O0-NEXT: movl $42, %ebx ; O0-NEXT: .Lpcsection462: ; O0-NEXT: lock cmpxchg16b (%rsi) -; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: .Lpcsection463: ; O0-NEXT: jne .LBB208_1 ; O0-NEXT: jmp .LBB208_2 @@ -13436,8 +13456,8 @@ define void @atomic128_add_monotonic(ptr %a) { ; O0-NEXT: jmp .LBB209_1 ; O0-NEXT: .LBB209_1: # %atomicrmw.start ; O0-NEXT: # =>This Inner Loop Header: Depth=1 -; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; O0-NEXT: movq %rax, %rbx ; O0-NEXT: .Lpcsection467: @@ -13447,8 +13467,8 @@ define void @atomic128_add_monotonic(ptr %a) { ; O0-NEXT: adcq $0, %rcx ; O0-NEXT: .Lpcsection469: ; O0-NEXT: lock cmpxchg16b (%rsi) -; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: .Lpcsection470: ; O0-NEXT: jne .LBB209_1 ; O0-NEXT: jmp .LBB209_2 @@ -13598,8 +13618,8 @@ define void @atomic128_sub_monotonic(ptr %a) { ; O0-NEXT: jmp .LBB210_1 ; O0-NEXT: .LBB210_1: # %atomicrmw.start ; O0-NEXT: # =>This Inner Loop Header: Depth=1 -; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; 
O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; O0-NEXT: movq %rax, %rbx ; O0-NEXT: .Lpcsection474: @@ -13609,8 +13629,8 @@ define void @atomic128_sub_monotonic(ptr %a) { ; O0-NEXT: adcq $-1, %rcx ; O0-NEXT: .Lpcsection476: ; O0-NEXT: lock cmpxchg16b (%rsi) -; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: .Lpcsection477: ; O0-NEXT: jne .LBB210_1 ; O0-NEXT: jmp .LBB210_2 @@ -13760,8 +13780,8 @@ define void @atomic128_and_monotonic(ptr %a) { ; O0-NEXT: jmp .LBB211_1 ; O0-NEXT: .LBB211_1: # %atomicrmw.start ; O0-NEXT: # =>This Inner Loop Header: Depth=1 -; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; O0-NEXT: movl %eax, %ecx ; O0-NEXT: .Lpcsection481: @@ -13773,8 +13793,8 @@ define void @atomic128_and_monotonic(ptr %a) { ; O0-NEXT: # kill: def $rcx killed $ecx ; O0-NEXT: .Lpcsection484: ; O0-NEXT: lock cmpxchg16b (%rsi) -; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: .Lpcsection485: ; O0-NEXT: jne .LBB211_1 ; O0-NEXT: jmp .LBB211_2 @@ -13920,8 +13940,8 @@ define void @atomic128_or_monotonic(ptr %a) { ; O0-NEXT: jmp .LBB212_1 ; O0-NEXT: .LBB212_1: # %atomicrmw.start ; O0-NEXT: # =>This Inner Loop Header: Depth=1 -; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; O0-NEXT: movq %rax, %rbx ; 
O0-NEXT: .Lpcsection489: @@ -13929,8 +13949,8 @@ define void @atomic128_or_monotonic(ptr %a) { ; O0-NEXT: movq %rcx, %rdx ; O0-NEXT: .Lpcsection490: ; O0-NEXT: lock cmpxchg16b (%rsi) -; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: .Lpcsection491: ; O0-NEXT: jne .LBB212_1 ; O0-NEXT: jmp .LBB212_2 @@ -14072,8 +14092,8 @@ define void @atomic128_xor_monotonic(ptr %a) { ; O0-NEXT: jmp .LBB213_1 ; O0-NEXT: .LBB213_1: # %atomicrmw.start ; O0-NEXT: # =>This Inner Loop Header: Depth=1 -; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; O0-NEXT: movq %rax, %rbx ; O0-NEXT: .Lpcsection495: @@ -14081,8 +14101,8 @@ define void @atomic128_xor_monotonic(ptr %a) { ; O0-NEXT: movq %rcx, %rdx ; O0-NEXT: .Lpcsection496: ; O0-NEXT: lock cmpxchg16b (%rsi) -; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: .Lpcsection497: ; O0-NEXT: jne .LBB213_1 ; O0-NEXT: jmp .LBB213_2 @@ -14224,8 +14244,8 @@ define void @atomic128_nand_monotonic(ptr %a) { ; O0-NEXT: jmp .LBB214_1 ; O0-NEXT: .LBB214_1: # %atomicrmw.start ; O0-NEXT: # =>This Inner Loop Header: Depth=1 -; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; O0-NEXT: movl %eax, %ecx ; O0-NEXT: .Lpcsection501: @@ -14239,8 +14259,8 @@ define void @atomic128_nand_monotonic(ptr %a) { ; O0-NEXT: movq $-1, %rcx ; O0-NEXT: .Lpcsection505: ; O0-NEXT: lock 
cmpxchg16b (%rsi) -; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: .Lpcsection506: ; O0-NEXT: jne .LBB214_1 ; O0-NEXT: jmp .LBB214_2 @@ -14394,8 +14414,8 @@ define void @atomic128_xchg_acquire(ptr %a) { ; O0-NEXT: jmp .LBB215_1 ; O0-NEXT: .LBB215_1: # %atomicrmw.start ; O0-NEXT: # =>This Inner Loop Header: Depth=1 -; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; O0-NEXT: .Lpcsection510: ; O0-NEXT: xorl %ecx, %ecx @@ -14405,8 +14425,8 @@ define void @atomic128_xchg_acquire(ptr %a) { ; O0-NEXT: movl $42, %ebx ; O0-NEXT: .Lpcsection513: ; O0-NEXT: lock cmpxchg16b (%rsi) -; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: .Lpcsection514: ; O0-NEXT: jne .LBB215_1 ; O0-NEXT: jmp .LBB215_2 @@ -14548,8 +14568,8 @@ define void @atomic128_add_acquire(ptr %a) { ; O0-NEXT: jmp .LBB216_1 ; O0-NEXT: .LBB216_1: # %atomicrmw.start ; O0-NEXT: # =>This Inner Loop Header: Depth=1 -; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; O0-NEXT: movq %rax, %rbx ; O0-NEXT: .Lpcsection518: @@ -14559,8 +14579,8 @@ define void @atomic128_add_acquire(ptr %a) { ; O0-NEXT: adcq $0, %rcx ; O0-NEXT: .Lpcsection520: ; O0-NEXT: lock cmpxchg16b (%rsi) -; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rax, 
{{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: .Lpcsection521: ; O0-NEXT: jne .LBB216_1 ; O0-NEXT: jmp .LBB216_2 @@ -14710,8 +14730,8 @@ define void @atomic128_sub_acquire(ptr %a) { ; O0-NEXT: jmp .LBB217_1 ; O0-NEXT: .LBB217_1: # %atomicrmw.start ; O0-NEXT: # =>This Inner Loop Header: Depth=1 -; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; O0-NEXT: movq %rax, %rbx ; O0-NEXT: .Lpcsection525: @@ -14721,8 +14741,8 @@ define void @atomic128_sub_acquire(ptr %a) { ; O0-NEXT: adcq $-1, %rcx ; O0-NEXT: .Lpcsection527: ; O0-NEXT: lock cmpxchg16b (%rsi) -; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: .Lpcsection528: ; O0-NEXT: jne .LBB217_1 ; O0-NEXT: jmp .LBB217_2 @@ -14872,8 +14892,8 @@ define void @atomic128_and_acquire(ptr %a) { ; O0-NEXT: jmp .LBB218_1 ; O0-NEXT: .LBB218_1: # %atomicrmw.start ; O0-NEXT: # =>This Inner Loop Header: Depth=1 -; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; O0-NEXT: movl %eax, %ecx ; O0-NEXT: .Lpcsection532: @@ -14885,8 +14905,8 @@ define void @atomic128_and_acquire(ptr %a) { ; O0-NEXT: # kill: def $rcx killed $ecx ; O0-NEXT: .Lpcsection535: ; O0-NEXT: lock cmpxchg16b (%rsi) -; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: .Lpcsection536: ; O0-NEXT: jne .LBB218_1 ; O0-NEXT: jmp .LBB218_2 @@ -15032,8 +15052,8 @@ define void 
@atomic128_or_acquire(ptr %a) { ; O0-NEXT: jmp .LBB219_1 ; O0-NEXT: .LBB219_1: # %atomicrmw.start ; O0-NEXT: # =>This Inner Loop Header: Depth=1 -; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; O0-NEXT: movq %rax, %rbx ; O0-NEXT: .Lpcsection540: @@ -15041,8 +15061,8 @@ define void @atomic128_or_acquire(ptr %a) { ; O0-NEXT: movq %rcx, %rdx ; O0-NEXT: .Lpcsection541: ; O0-NEXT: lock cmpxchg16b (%rsi) -; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: .Lpcsection542: ; O0-NEXT: jne .LBB219_1 ; O0-NEXT: jmp .LBB219_2 @@ -15184,8 +15204,8 @@ define void @atomic128_xor_acquire(ptr %a) { ; O0-NEXT: jmp .LBB220_1 ; O0-NEXT: .LBB220_1: # %atomicrmw.start ; O0-NEXT: # =>This Inner Loop Header: Depth=1 -; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; O0-NEXT: movq %rax, %rbx ; O0-NEXT: .Lpcsection546: @@ -15193,8 +15213,8 @@ define void @atomic128_xor_acquire(ptr %a) { ; O0-NEXT: movq %rcx, %rdx ; O0-NEXT: .Lpcsection547: ; O0-NEXT: lock cmpxchg16b (%rsi) -; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: .Lpcsection548: ; O0-NEXT: jne .LBB220_1 ; O0-NEXT: jmp .LBB220_2 @@ -15336,8 +15356,8 @@ define void @atomic128_nand_acquire(ptr %a) { ; O0-NEXT: jmp .LBB221_1 ; O0-NEXT: .LBB221_1: # %atomicrmw.start ; O0-NEXT: # =>This Inner Loop Header: Depth=1 -; O0-NEXT: movq 
{{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; O0-NEXT: movl %eax, %ecx ; O0-NEXT: .Lpcsection552: @@ -15351,8 +15371,8 @@ define void @atomic128_nand_acquire(ptr %a) { ; O0-NEXT: movq $-1, %rcx ; O0-NEXT: .Lpcsection556: ; O0-NEXT: lock cmpxchg16b (%rsi) -; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: .Lpcsection557: ; O0-NEXT: jne .LBB221_1 ; O0-NEXT: jmp .LBB221_2 @@ -15506,8 +15526,8 @@ define void @atomic128_xchg_release(ptr %a) { ; O0-NEXT: jmp .LBB222_1 ; O0-NEXT: .LBB222_1: # %atomicrmw.start ; O0-NEXT: # =>This Inner Loop Header: Depth=1 -; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; O0-NEXT: .Lpcsection561: ; O0-NEXT: xorl %ecx, %ecx @@ -15517,8 +15537,8 @@ define void @atomic128_xchg_release(ptr %a) { ; O0-NEXT: movl $42, %ebx ; O0-NEXT: .Lpcsection564: ; O0-NEXT: lock cmpxchg16b (%rsi) -; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: .Lpcsection565: ; O0-NEXT: jne .LBB222_1 ; O0-NEXT: jmp .LBB222_2 @@ -15659,8 +15679,8 @@ define void @atomic128_add_release(ptr %a) { ; O0-NEXT: jmp .LBB223_1 ; O0-NEXT: .LBB223_1: # %atomicrmw.start ; O0-NEXT: # =>This Inner Loop Header: Depth=1 -; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload 
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; O0-NEXT: movq %rax, %rbx ; O0-NEXT: .Lpcsection569: @@ -15670,8 +15690,8 @@ define void @atomic128_add_release(ptr %a) { ; O0-NEXT: adcq $0, %rcx ; O0-NEXT: .Lpcsection571: ; O0-NEXT: lock cmpxchg16b (%rsi) -; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: .Lpcsection572: ; O0-NEXT: jne .LBB223_1 ; O0-NEXT: jmp .LBB223_2 @@ -15821,8 +15841,8 @@ define void @atomic128_sub_release(ptr %a) { ; O0-NEXT: jmp .LBB224_1 ; O0-NEXT: .LBB224_1: # %atomicrmw.start ; O0-NEXT: # =>This Inner Loop Header: Depth=1 -; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; O0-NEXT: movq %rax, %rbx ; O0-NEXT: .Lpcsection576: @@ -15832,8 +15852,8 @@ define void @atomic128_sub_release(ptr %a) { ; O0-NEXT: adcq $-1, %rcx ; O0-NEXT: .Lpcsection578: ; O0-NEXT: lock cmpxchg16b (%rsi) -; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: .Lpcsection579: ; O0-NEXT: jne .LBB224_1 ; O0-NEXT: jmp .LBB224_2 @@ -15983,8 +16003,8 @@ define void @atomic128_and_release(ptr %a) { ; O0-NEXT: jmp .LBB225_1 ; O0-NEXT: .LBB225_1: # %atomicrmw.start ; O0-NEXT: # =>This Inner Loop Header: Depth=1 -; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; O0-NEXT: movl %eax, %ecx ; O0-NEXT: .Lpcsection583: @@ -15996,8 +16016,8 @@ define void 
@atomic128_and_release(ptr %a) { ; O0-NEXT: # kill: def $rcx killed $ecx ; O0-NEXT: .Lpcsection586: ; O0-NEXT: lock cmpxchg16b (%rsi) -; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: .Lpcsection587: ; O0-NEXT: jne .LBB225_1 ; O0-NEXT: jmp .LBB225_2 @@ -16143,8 +16163,8 @@ define void @atomic128_or_release(ptr %a) { ; O0-NEXT: jmp .LBB226_1 ; O0-NEXT: .LBB226_1: # %atomicrmw.start ; O0-NEXT: # =>This Inner Loop Header: Depth=1 -; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; O0-NEXT: movq %rax, %rbx ; O0-NEXT: .Lpcsection591: @@ -16152,8 +16172,8 @@ define void @atomic128_or_release(ptr %a) { ; O0-NEXT: movq %rcx, %rdx ; O0-NEXT: .Lpcsection592: ; O0-NEXT: lock cmpxchg16b (%rsi) -; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: .Lpcsection593: ; O0-NEXT: jne .LBB226_1 ; O0-NEXT: jmp .LBB226_2 @@ -16295,8 +16315,8 @@ define void @atomic128_xor_release(ptr %a) { ; O0-NEXT: jmp .LBB227_1 ; O0-NEXT: .LBB227_1: # %atomicrmw.start ; O0-NEXT: # =>This Inner Loop Header: Depth=1 -; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; O0-NEXT: movq %rax, %rbx ; O0-NEXT: .Lpcsection597: @@ -16304,8 +16324,8 @@ define void @atomic128_xor_release(ptr %a) { ; O0-NEXT: movq %rcx, %rdx ; O0-NEXT: .Lpcsection598: ; O0-NEXT: lock cmpxchg16b (%rsi) -; O0-NEXT: movq %rax, 
{{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: .Lpcsection599: ; O0-NEXT: jne .LBB227_1 ; O0-NEXT: jmp .LBB227_2 @@ -16447,8 +16467,8 @@ define void @atomic128_nand_release(ptr %a) { ; O0-NEXT: jmp .LBB228_1 ; O0-NEXT: .LBB228_1: # %atomicrmw.start ; O0-NEXT: # =>This Inner Loop Header: Depth=1 -; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; O0-NEXT: movl %eax, %ecx ; O0-NEXT: .Lpcsection603: @@ -16462,8 +16482,8 @@ define void @atomic128_nand_release(ptr %a) { ; O0-NEXT: movq $-1, %rcx ; O0-NEXT: .Lpcsection607: ; O0-NEXT: lock cmpxchg16b (%rsi) -; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: .Lpcsection608: ; O0-NEXT: jne .LBB228_1 ; O0-NEXT: jmp .LBB228_2 @@ -16617,8 +16637,8 @@ define void @atomic128_xchg_acq_rel(ptr %a) { ; O0-NEXT: jmp .LBB229_1 ; O0-NEXT: .LBB229_1: # %atomicrmw.start ; O0-NEXT: # =>This Inner Loop Header: Depth=1 -; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; O0-NEXT: .Lpcsection612: ; O0-NEXT: xorl %ecx, %ecx @@ -16628,8 +16648,8 @@ define void @atomic128_xchg_acq_rel(ptr %a) { ; O0-NEXT: movl $42, %ebx ; O0-NEXT: .Lpcsection615: ; O0-NEXT: lock cmpxchg16b (%rsi) -; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; 
O0-NEXT: .Lpcsection616: ; O0-NEXT: jne .LBB229_1 ; O0-NEXT: jmp .LBB229_2 @@ -16771,8 +16791,8 @@ define void @atomic128_add_acq_rel(ptr %a) { ; O0-NEXT: jmp .LBB230_1 ; O0-NEXT: .LBB230_1: # %atomicrmw.start ; O0-NEXT: # =>This Inner Loop Header: Depth=1 -; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; O0-NEXT: movq %rax, %rbx ; O0-NEXT: .Lpcsection620: @@ -16782,8 +16802,8 @@ define void @atomic128_add_acq_rel(ptr %a) { ; O0-NEXT: adcq $0, %rcx ; O0-NEXT: .Lpcsection622: ; O0-NEXT: lock cmpxchg16b (%rsi) -; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: .Lpcsection623: ; O0-NEXT: jne .LBB230_1 ; O0-NEXT: jmp .LBB230_2 @@ -16933,8 +16953,8 @@ define void @atomic128_sub_acq_rel(ptr %a) { ; O0-NEXT: jmp .LBB231_1 ; O0-NEXT: .LBB231_1: # %atomicrmw.start ; O0-NEXT: # =>This Inner Loop Header: Depth=1 -; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; O0-NEXT: movq %rax, %rbx ; O0-NEXT: .Lpcsection627: @@ -16944,8 +16964,8 @@ define void @atomic128_sub_acq_rel(ptr %a) { ; O0-NEXT: adcq $-1, %rcx ; O0-NEXT: .Lpcsection629: ; O0-NEXT: lock cmpxchg16b (%rsi) -; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: .Lpcsection630: ; O0-NEXT: jne .LBB231_1 ; O0-NEXT: jmp .LBB231_2 @@ -17095,8 +17115,8 @@ define void @atomic128_and_acq_rel(ptr %a) { ; O0-NEXT: jmp .LBB232_1 ; 
O0-NEXT: .LBB232_1: # %atomicrmw.start ; O0-NEXT: # =>This Inner Loop Header: Depth=1 -; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; O0-NEXT: movl %eax, %ecx ; O0-NEXT: .Lpcsection634: @@ -17108,8 +17128,8 @@ define void @atomic128_and_acq_rel(ptr %a) { ; O0-NEXT: # kill: def $rcx killed $ecx ; O0-NEXT: .Lpcsection637: ; O0-NEXT: lock cmpxchg16b (%rsi) -; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: .Lpcsection638: ; O0-NEXT: jne .LBB232_1 ; O0-NEXT: jmp .LBB232_2 @@ -17255,8 +17275,8 @@ define void @atomic128_or_acq_rel(ptr %a) { ; O0-NEXT: jmp .LBB233_1 ; O0-NEXT: .LBB233_1: # %atomicrmw.start ; O0-NEXT: # =>This Inner Loop Header: Depth=1 -; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; O0-NEXT: movq %rax, %rbx ; O0-NEXT: .Lpcsection642: @@ -17264,8 +17284,8 @@ define void @atomic128_or_acq_rel(ptr %a) { ; O0-NEXT: movq %rcx, %rdx ; O0-NEXT: .Lpcsection643: ; O0-NEXT: lock cmpxchg16b (%rsi) -; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: .Lpcsection644: ; O0-NEXT: jne .LBB233_1 ; O0-NEXT: jmp .LBB233_2 @@ -17407,8 +17427,8 @@ define void @atomic128_xor_acq_rel(ptr %a) { ; O0-NEXT: jmp .LBB234_1 ; O0-NEXT: .LBB234_1: # %atomicrmw.start ; O0-NEXT: # =>This Inner Loop Header: Depth=1 -; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; O0-NEXT: 
movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; O0-NEXT: movq %rax, %rbx ; O0-NEXT: .Lpcsection648: @@ -17416,8 +17436,8 @@ define void @atomic128_xor_acq_rel(ptr %a) { ; O0-NEXT: movq %rcx, %rdx ; O0-NEXT: .Lpcsection649: ; O0-NEXT: lock cmpxchg16b (%rsi) -; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: .Lpcsection650: ; O0-NEXT: jne .LBB234_1 ; O0-NEXT: jmp .LBB234_2 @@ -17559,8 +17579,8 @@ define void @atomic128_nand_acq_rel(ptr %a) { ; O0-NEXT: jmp .LBB235_1 ; O0-NEXT: .LBB235_1: # %atomicrmw.start ; O0-NEXT: # =>This Inner Loop Header: Depth=1 -; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; O0-NEXT: movl %eax, %ecx ; O0-NEXT: .Lpcsection654: @@ -17574,8 +17594,8 @@ define void @atomic128_nand_acq_rel(ptr %a) { ; O0-NEXT: movq $-1, %rcx ; O0-NEXT: .Lpcsection658: ; O0-NEXT: lock cmpxchg16b (%rsi) -; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: .Lpcsection659: ; O0-NEXT: jne .LBB235_1 ; O0-NEXT: jmp .LBB235_2 @@ -17729,8 +17749,8 @@ define void @atomic128_xchg_seq_cst(ptr %a) { ; O0-NEXT: jmp .LBB236_1 ; O0-NEXT: .LBB236_1: # %atomicrmw.start ; O0-NEXT: # =>This Inner Loop Header: Depth=1 -; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte 
Reload ; O0-NEXT: .Lpcsection663: ; O0-NEXT: xorl %ecx, %ecx @@ -17740,8 +17760,8 @@ define void @atomic128_xchg_seq_cst(ptr %a) { ; O0-NEXT: movl $42, %ebx ; O0-NEXT: .Lpcsection666: ; O0-NEXT: lock cmpxchg16b (%rsi) -; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: .Lpcsection667: ; O0-NEXT: jne .LBB236_1 ; O0-NEXT: jmp .LBB236_2 @@ -17883,8 +17903,8 @@ define void @atomic128_add_seq_cst(ptr %a) { ; O0-NEXT: jmp .LBB237_1 ; O0-NEXT: .LBB237_1: # %atomicrmw.start ; O0-NEXT: # =>This Inner Loop Header: Depth=1 -; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; O0-NEXT: movq %rax, %rbx ; O0-NEXT: .Lpcsection671: @@ -17894,8 +17914,8 @@ define void @atomic128_add_seq_cst(ptr %a) { ; O0-NEXT: adcq $0, %rcx ; O0-NEXT: .Lpcsection673: ; O0-NEXT: lock cmpxchg16b (%rsi) -; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: .Lpcsection674: ; O0-NEXT: jne .LBB237_1 ; O0-NEXT: jmp .LBB237_2 @@ -18045,8 +18065,8 @@ define void @atomic128_sub_seq_cst(ptr %a) { ; O0-NEXT: jmp .LBB238_1 ; O0-NEXT: .LBB238_1: # %atomicrmw.start ; O0-NEXT: # =>This Inner Loop Header: Depth=1 -; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; O0-NEXT: movq %rax, %rbx ; O0-NEXT: .Lpcsection678: @@ -18056,8 +18076,8 @@ define void @atomic128_sub_seq_cst(ptr %a) { ; O0-NEXT: adcq $-1, %rcx ; O0-NEXT: 
.Lpcsection680: ; O0-NEXT: lock cmpxchg16b (%rsi) -; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: .Lpcsection681: ; O0-NEXT: jne .LBB238_1 ; O0-NEXT: jmp .LBB238_2 @@ -18207,8 +18227,8 @@ define void @atomic128_and_seq_cst(ptr %a) { ; O0-NEXT: jmp .LBB239_1 ; O0-NEXT: .LBB239_1: # %atomicrmw.start ; O0-NEXT: # =>This Inner Loop Header: Depth=1 -; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; O0-NEXT: movl %eax, %ecx ; O0-NEXT: .Lpcsection685: @@ -18220,8 +18240,8 @@ define void @atomic128_and_seq_cst(ptr %a) { ; O0-NEXT: # kill: def $rcx killed $ecx ; O0-NEXT: .Lpcsection688: ; O0-NEXT: lock cmpxchg16b (%rsi) -; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: .Lpcsection689: ; O0-NEXT: jne .LBB239_1 ; O0-NEXT: jmp .LBB239_2 @@ -18367,8 +18387,8 @@ define void @atomic128_or_seq_cst(ptr %a) { ; O0-NEXT: jmp .LBB240_1 ; O0-NEXT: .LBB240_1: # %atomicrmw.start ; O0-NEXT: # =>This Inner Loop Header: Depth=1 -; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; O0-NEXT: movq %rax, %rbx ; O0-NEXT: .Lpcsection693: @@ -18376,8 +18396,8 @@ define void @atomic128_or_seq_cst(ptr %a) { ; O0-NEXT: movq %rcx, %rdx ; O0-NEXT: .Lpcsection694: ; O0-NEXT: lock cmpxchg16b (%rsi) -; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 
8-byte Spill +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: .Lpcsection695: ; O0-NEXT: jne .LBB240_1 ; O0-NEXT: jmp .LBB240_2 @@ -18519,8 +18539,8 @@ define void @atomic128_xor_seq_cst(ptr %a) { ; O0-NEXT: jmp .LBB241_1 ; O0-NEXT: .LBB241_1: # %atomicrmw.start ; O0-NEXT: # =>This Inner Loop Header: Depth=1 -; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; O0-NEXT: movq %rax, %rbx ; O0-NEXT: .Lpcsection699: @@ -18528,8 +18548,8 @@ define void @atomic128_xor_seq_cst(ptr %a) { ; O0-NEXT: movq %rcx, %rdx ; O0-NEXT: .Lpcsection700: ; O0-NEXT: lock cmpxchg16b (%rsi) -; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: .Lpcsection701: ; O0-NEXT: jne .LBB241_1 ; O0-NEXT: jmp .LBB241_2 @@ -18671,8 +18691,8 @@ define void @atomic128_nand_seq_cst(ptr %a) { ; O0-NEXT: jmp .LBB242_1 ; O0-NEXT: .LBB242_1: # %atomicrmw.start ; O0-NEXT: # =>This Inner Loop Header: Depth=1 -; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; O0-NEXT: movl %eax, %ecx ; O0-NEXT: .Lpcsection705: @@ -18686,8 +18706,8 @@ define void @atomic128_nand_seq_cst(ptr %a) { ; O0-NEXT: movq $-1, %rcx ; O0-NEXT: .Lpcsection709: ; O0-NEXT: lock cmpxchg16b (%rsi) -; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; O0-NEXT: .Lpcsection710: ; O0-NEXT: jne .LBB242_1 ; O0-NEXT: jmp .LBB242_2 diff --git 
a/llvm/test/CodeGen/X86/pr15705.ll b/llvm/test/CodeGen/X86/pr15705.ll index 3dd4aabe25b57..bbbeb53764686 100644 --- a/llvm/test/CodeGen/X86/pr15705.ll +++ b/llvm/test/CodeGen/X86/pr15705.ll @@ -5,18 +5,19 @@ define i32 @PR15705(i32 %x, i32 %a, i32 %b, i32 %c) #0 { ; X86-LABEL: PR15705: ; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: cmpl %ecx, %edx -; X86-NEXT: je .LBB0_4 -; X86-NEXT: # %bb.1: # %if.end ; X86-NEXT: cmpl %eax, %edx -; X86-NEXT: jne .LBB0_3 -; X86-NEXT: # %bb.2: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: .LBB0_3: # %if.end +; X86-NEXT: jne .LBB0_2 +; X86-NEXT: # %bb.1: ; X86-NEXT: movl %ecx, %eax +; X86-NEXT: retl +; X86-NEXT: .LBB0_2: # %if.end +; X86-NEXT: cmpl %ecx, %edx +; X86-NEXT: jne .LBB0_4 +; X86-NEXT: # %bb.3: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: .LBB0_4: # %return ; X86-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/pr32256.ll b/llvm/test/CodeGen/X86/pr32256.ll index 225a3af551a2c..09f7d92c1db2e 100644 --- a/llvm/test/CodeGen/X86/pr32256.ll +++ b/llvm/test/CodeGen/X86/pr32256.ll @@ -9,12 +9,12 @@ define void @_Z1av() { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subl $2, %esp ; CHECK-NEXT: .cfi_def_cfa_offset 6 -; CHECK-NEXT: movb c, %cl -; CHECK-NEXT: xorb $-1, %cl -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: # kill: def $al killed $al killed $eax -; CHECK-NEXT: testb $1, %cl -; CHECK-NEXT: movb %al, (%esp) # 1-byte Spill +; CHECK-NEXT: movb c, %al +; CHECK-NEXT: xorb $-1, %al +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: movb %cl, (%esp) # 1-byte Spill +; CHECK-NEXT: testb $1, %al ; CHECK-NEXT: jne .LBB0_1 ; CHECK-NEXT: jmp .LBB0_2 ; CHECK-NEXT: .LBB0_1: # %land.rhs diff --git a/llvm/test/CodeGen/X86/pr38795.ll b/llvm/test/CodeGen/X86/pr38795.ll index c3c96e8228797..02dbe4f545fe5 100644 --- 
a/llvm/test/CodeGen/X86/pr38795.ll +++ b/llvm/test/CodeGen/X86/pr38795.ll @@ -33,6 +33,7 @@ define dso_local void @fn() { ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_15: # %for.inc ; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: movl %esi, %ecx ; CHECK-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; CHECK-NEXT: .LBB0_1: # %for.cond ; CHECK-NEXT: # =>This Loop Header: Depth=1 @@ -69,7 +70,6 @@ define dso_local void @fn() { ; CHECK-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %dh # 1-byte Reload ; CHECK-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Reload ; CHECK-NEXT: testb %bl, %bl -; CHECK-NEXT: movl %esi, %ecx ; CHECK-NEXT: # implicit-def: $eax ; CHECK-NEXT: movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; CHECK-NEXT: jne .LBB0_15 @@ -119,7 +119,7 @@ define dso_local void @fn() { ; CHECK-NEXT: jne .LBB0_9 ; CHECK-NEXT: # %bb.12: # %if.end26 ; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: xorl %esi, %esi ; CHECK-NEXT: testb %dh, %dh ; CHECK-NEXT: je .LBB0_15 ; CHECK-NEXT: # %bb.13: # %if.end26 @@ -128,7 +128,7 @@ define dso_local void @fn() { ; CHECK-NEXT: jne .LBB0_15 ; CHECK-NEXT: # %bb.14: # %if.then31 ; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: xorl %esi, %esi ; CHECK-NEXT: xorl %ebp, %ebp ; CHECK-NEXT: jmp .LBB0_15 ; CHECK-NEXT: .p2align 4 @@ -279,31 +279,34 @@ define void @verifier_error_reduced_issue38788(i1 %cmp11) { ; CHECK-NEXT: je .LBB1_3 ; CHECK-NEXT: # %bb.2: # in Loop: Header=BB1_1 Depth=1 ; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: movl %ebx, %edx ; CHECK-NEXT: jmp .LBB1_5 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB1_3: # %if.end ; CHECK-NEXT: # in Loop: Header=BB1_1 Depth=1 +; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: testb $1, %al ; CHECK-NEXT: je .LBB1_4 ; CHECK-NEXT: # %bb.9: # %if.then13 ; CHECK-NEXT: # in Loop: Header=BB1_1 Depth=1 -; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: testb $1, %al ; CHECK-NEXT: movl %ebx, %eax ; 
CHECK-NEXT: movl $0, %ebx ; CHECK-NEXT: jne .LBB1_8 +; CHECK-NEXT: # %bb.10: # %for.cond35 +; CHECK-NEXT: # in Loop: Header=BB1_1 Depth=1 +; CHECK-NEXT: movl %ebx, %edx ; CHECK-NEXT: jmp .LBB1_5 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB1_4: # in Loop: Header=BB1_1 Depth=1 ; CHECK-NEXT: movl %ebx, %eax -; CHECK-NEXT: xorl %ebx, %ebx ; CHECK-NEXT: .LBB1_5: # %if.end26 ; CHECK-NEXT: # in Loop: Header=BB1_1 Depth=1 ; CHECK-NEXT: testb %cl, %cl ; CHECK-NEXT: je .LBB1_7 ; CHECK-NEXT: # %bb.6: # %if.end26 ; CHECK-NEXT: # in Loop: Header=BB1_1 Depth=1 -; CHECK-NEXT: movl %ebx, %ecx +; CHECK-NEXT: movl %edx, %ecx ; CHECK-NEXT: jmp .LBB1_7 entry: br label %for.cond diff --git a/llvm/test/CodeGen/X86/pr49451.ll b/llvm/test/CodeGen/X86/pr49451.ll index 173c41140ebef..1a7551f6117e8 100644 --- a/llvm/test/CodeGen/X86/pr49451.ll +++ b/llvm/test/CodeGen/X86/pr49451.ll @@ -18,15 +18,15 @@ define void @func_6(i8 %uc_8, i64 %uli_10) nounwind { ; X86-NEXT: .p2align 4 ; X86-NEXT: .LBB0_1: # %for.body612 ; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: testb %dl, %dl +; X86-NEXT: testb %bl, %bl ; X86-NEXT: je .LBB0_2 ; X86-NEXT: # %bb.3: # %if.end1401 ; X86-NEXT: # in Loop: Header=BB0_1 Depth=1 ; X86-NEXT: addl %eax, %esi ; X86-NEXT: movw %si, s_2 -; X86-NEXT: movw %bx, s_0 +; X86-NEXT: movw %dx, s_0 ; X86-NEXT: incl %ecx -; X86-NEXT: incl %ebx +; X86-NEXT: incl %edx ; X86-NEXT: cmpw $73, %cx ; X86-NEXT: jl .LBB0_1 ; X86-NEXT: # %bb.4: # %for.body1703 diff --git a/llvm/test/CodeGen/X86/pr63108.ll b/llvm/test/CodeGen/X86/pr63108.ll index b5b80515fc6d9..8df90a935314d 100644 --- a/llvm/test/CodeGen/X86/pr63108.ll +++ b/llvm/test/CodeGen/X86/pr63108.ll @@ -21,7 +21,7 @@ define i32 @PR63108() { ; SSE-NEXT: # =>This Inner Loop Header: Depth=1 ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: testb %al, %al -; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: jne .LBB0_3 ; SSE-NEXT: # %bb.4: # %middle.block ; SSE-NEXT: pxor %xmm2, %xmm0 diff --git 
a/llvm/test/CodeGen/X86/sad.ll b/llvm/test/CodeGen/X86/sad.ll index 7364b15045b40..dc546541d714d 100644 --- a/llvm/test/CodeGen/X86/sad.ll +++ b/llvm/test/CodeGen/X86/sad.ll @@ -151,27 +151,27 @@ define dso_local i32 @sad_32i8() nounwind { ; SSE2-LABEL: sad_32i8: ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00 ; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00 ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: .p2align 4 ; SSE2-NEXT: .LBB1_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 ; SSE2-NEXT: movdqa a+1024(%rax), %xmm3 ; SSE2-NEXT: psadbw b+1024(%rax), %xmm3 -; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: paddd %xmm3, %xmm2 ; SSE2-NEXT: movdqa a+1040(%rax), %xmm3 ; SSE2-NEXT: psadbw b+1040(%rax), %xmm3 -; SSE2-NEXT: paddd %xmm3, %xmm2 +; SSE2-NEXT: paddd %xmm3, %xmm1 ; SSE2-NEXT: addq $32, %rax ; SSE2-NEXT: jne .LBB1_1 ; SSE2-NEXT: # %bb.2: # %middle.block -; SSE2-NEXT: paddd %xmm0, %xmm2 ; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: paddd %xmm0, %xmm2 ; SSE2-NEXT: paddd %xmm0, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: paddd %xmm2, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm2 ; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] @@ -356,9 +356,9 @@ define dso_local i32 @sad_avx64i8() nounwind { ; AVX1-LABEL: sad_avx64i8: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: .p2align 4 ; AVX1-NEXT: .LBB2_1: # %vector.body ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 @@ -408,26 +408,26 @@ define dso_local i32 @sad_avx64i8() nounwind { ; AVX2-LABEL: sad_avx64i8: ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; 
AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: .p2align 4 ; AVX2-NEXT: .LBB2_1: # %vector.body ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 ; AVX2-NEXT: vmovdqa a+1024(%rax), %ymm3 ; AVX2-NEXT: vpsadbw b+1024(%rax), %ymm3, %ymm3 -; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vmovdqa a+1056(%rax), %ymm3 ; AVX2-NEXT: vpsadbw b+1056(%rax), %ymm3, %ymm3 -; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: addq $64, %rax ; AVX2-NEXT: jne .LBB2_1 ; AVX2-NEXT: # %bb.2: # %middle.block -; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm2 +; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm0, %ymm0, %ymm3 -; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpaddd %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpaddd %ymm3, %ymm2, %ymm1 +; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll b/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll index 325f735b09cd9..b4a6960d144e1 100644 --- a/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll +++ b/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll @@ -1266,13 +1266,12 @@ define <4 x float> @add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, ; X86-SSE-LABEL: add_ss_mask: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: testb $1, {{[0-9]+}}(%esp) -; X86-SSE-NEXT: jne .LBB70_1 -; X86-SSE-NEXT: # %bb.2: -; X86-SSE-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] -; X86-SSE-NEXT: retl -; X86-SSE-NEXT: .LBB70_1: +; X86-SSE-NEXT: je .LBB70_2 +; X86-SSE-NEXT: # %bb.1: ; X86-SSE-NEXT: addss %xmm0, %xmm1 -; X86-SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X86-SSE-NEXT: movaps %xmm1, %xmm2 +; X86-SSE-NEXT: .LBB70_2: +; 
X86-SSE-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] ; X86-SSE-NEXT: retl ; ; X86-AVX1-LABEL: add_ss_mask: @@ -1296,13 +1295,12 @@ define <4 x float> @add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, ; X64-SSE-LABEL: add_ss_mask: ; X64-SSE: # %bb.0: ; X64-SSE-NEXT: testb $1, %dil -; X64-SSE-NEXT: jne .LBB70_1 -; X64-SSE-NEXT: # %bb.2: -; X64-SSE-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] -; X64-SSE-NEXT: retq -; X64-SSE-NEXT: .LBB70_1: +; X64-SSE-NEXT: je .LBB70_2 +; X64-SSE-NEXT: # %bb.1: ; X64-SSE-NEXT: addss %xmm0, %xmm1 -; X64-SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X64-SSE-NEXT: movaps %xmm1, %xmm2 +; X64-SSE-NEXT: .LBB70_2: +; X64-SSE-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] ; X64-SSE-NEXT: retq ; ; X64-AVX1-LABEL: add_ss_mask: @@ -1336,13 +1334,12 @@ define <2 x double> @add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> ; X86-SSE-LABEL: add_sd_mask: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: testb $1, {{[0-9]+}}(%esp) -; X86-SSE-NEXT: jne .LBB71_1 -; X86-SSE-NEXT: # %bb.2: -; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; X86-SSE-NEXT: retl -; X86-SSE-NEXT: .LBB71_1: +; X86-SSE-NEXT: je .LBB71_2 +; X86-SSE-NEXT: # %bb.1: ; X86-SSE-NEXT: addsd %xmm0, %xmm1 -; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; X86-SSE-NEXT: movapd %xmm1, %xmm2 +; X86-SSE-NEXT: .LBB71_2: +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; X86-SSE-NEXT: retl ; ; X86-AVX1-LABEL: add_sd_mask: @@ -1366,13 +1363,12 @@ define <2 x double> @add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> ; X64-SSE-LABEL: add_sd_mask: ; X64-SSE: # %bb.0: ; X64-SSE-NEXT: testb $1, %dil -; X64-SSE-NEXT: jne .LBB71_1 -; X64-SSE-NEXT: # %bb.2: -; X64-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; X64-SSE-NEXT: retq -; X64-SSE-NEXT: .LBB71_1: +; X64-SSE-NEXT: je .LBB71_2 +; X64-SSE-NEXT: # %bb.1: ; X64-SSE-NEXT: addsd %xmm0, %xmm1 -; X64-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; X64-SSE-NEXT: movapd %xmm1, %xmm2 +; 
X64-SSE-NEXT: .LBB71_2: +; X64-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; X64-SSE-NEXT: retq ; ; X64-AVX1-LABEL: add_sd_mask: diff --git a/llvm/test/CodeGen/X86/statepoint-cmp-sunk-past-statepoint.ll b/llvm/test/CodeGen/X86/statepoint-cmp-sunk-past-statepoint.ll index 8d589c519eff2..0200f1e34e8eb 100644 --- a/llvm/test/CodeGen/X86/statepoint-cmp-sunk-past-statepoint.ll +++ b/llvm/test/CodeGen/X86/statepoint-cmp-sunk-past-statepoint.ll @@ -83,15 +83,13 @@ define void @test2(ptr addrspace(1) %this, i32 %0, ptr addrspace(1) %p0, ptr add ; CHECK-LV-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY killed $rcx ; CHECK-LV-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY killed $rdx ; CHECK-LV-NEXT: [[MOV32r0_:%[0-9]+]]:gr32 = MOV32r0 implicit-def dead $eflags - ; CHECK-LV-NEXT: [[COPY2:%[0-9]+]]:gr64 = COPY killed [[COPY1]] - ; CHECK-LV-NEXT: [[COPY3:%[0-9]+]]:gr64 = COPY killed [[COPY]] ; CHECK-LV-NEXT: {{ $}} ; CHECK-LV-NEXT: bb.1.loop.head: ; CHECK-LV-NEXT: successors: %bb.6(0x04000000), %bb.2(0x7c000000) ; CHECK-LV-NEXT: {{ $}} - ; CHECK-LV-NEXT: [[COPY4:%[0-9]+]]:gr64 = COPY killed [[COPY3]] - ; CHECK-LV-NEXT: [[COPY5:%[0-9]+]]:gr64 = COPY killed [[COPY2]] - ; CHECK-LV-NEXT: TEST64rr killed [[COPY5]], [[COPY5]], implicit-def $eflags + ; CHECK-LV-NEXT: [[COPY2:%[0-9]+]]:gr64 = COPY killed [[COPY]] + ; CHECK-LV-NEXT: [[COPY3:%[0-9]+]]:gr64 = COPY killed [[COPY1]] + ; CHECK-LV-NEXT: TEST64rr killed [[COPY3]], [[COPY3]], implicit-def $eflags ; CHECK-LV-NEXT: JCC_1 %bb.6, 5, implicit killed $eflags ; CHECK-LV-NEXT: JMP_1 %bb.2 ; CHECK-LV-NEXT: {{ $}} @@ -99,12 +97,12 @@ define void @test2(ptr addrspace(1) %this, i32 %0, ptr addrspace(1) %p0, ptr add ; CHECK-LV-NEXT: successors: %bb.3(0x7ffff800), %bb.7(0x00000800) ; CHECK-LV-NEXT: {{ $}} ; CHECK-LV-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - ; CHECK-LV-NEXT: [[COPY4:%[0-9]+]]:gr64 = STATEPOINT 2882400000, 0, 0, undef %11:gr64, 2, 0, 2, 0, 2, 0, 2, 
1, [[COPY4]](tied-def 0), 2, 0, 2, 1, 0, 0, csr_64, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax + ; CHECK-LV-NEXT: [[COPY2:%[0-9]+]]:gr64 = STATEPOINT 2882400000, 0, 0, undef %11:gr64, 2, 0, 2, 0, 2, 0, 2, 1, [[COPY2]](tied-def 0), 2, 0, 2, 1, 0, 0, csr_64, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax ; CHECK-LV-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp ; CHECK-LV-NEXT: EH_LABEL ; CHECK-LV-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - ; CHECK-LV-NEXT: [[COPY6:%[0-9]+]]:gr64 = COPY [[COPY4]] - ; CHECK-LV-NEXT: [[COPY6:%[0-9]+]]:gr64 = STATEPOINT 2882400000, 0, 0, undef %13:gr64, 2, 0, 2, 0, 2, 0, 2, 1, [[COPY6]](tied-def 0), 2, 0, 2, 1, 0, 0, csr_64, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax + ; CHECK-LV-NEXT: [[COPY4:%[0-9]+]]:gr64 = COPY [[COPY2]] + ; CHECK-LV-NEXT: [[COPY4:%[0-9]+]]:gr64 = STATEPOINT 2882400000, 0, 0, undef %13:gr64, 2, 0, 2, 0, 2, 0, 2, 1, [[COPY4]](tied-def 0), 2, 0, 2, 1, 0, 0, csr_64, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax ; CHECK-LV-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp ; CHECK-LV-NEXT: EH_LABEL ; CHECK-LV-NEXT: JMP_1 %bb.3 @@ -112,8 +110,8 @@ define void @test2(ptr addrspace(1) %this, i32 %0, ptr addrspace(1) %p0, ptr add ; CHECK-LV-NEXT: bb.3.BB2: ; CHECK-LV-NEXT: successors: %bb.4(0x40000000), %bb.5(0x40000000) ; CHECK-LV-NEXT: {{ $}} - ; CHECK-LV-NEXT: [[COPY7:%[0-9]+]]:gr8 = COPY [[MOV32r0_]].sub_8bit - ; CHECK-LV-NEXT: TEST8rr killed [[COPY7]], [[COPY7]], implicit-def $eflags + ; CHECK-LV-NEXT: [[COPY5:%[0-9]+]]:gr8 = COPY [[MOV32r0_]].sub_8bit + ; CHECK-LV-NEXT: TEST8rr killed [[COPY5]], [[COPY5]], implicit-def $eflags ; CHECK-LV-NEXT: JCC_1 %bb.5, 5, implicit killed $eflags ; 
CHECK-LV-NEXT: JMP_1 %bb.4 ; CHECK-LV-NEXT: {{ $}} @@ -123,12 +121,11 @@ define void @test2(ptr addrspace(1) %this, i32 %0, ptr addrspace(1) %p0, ptr add ; CHECK-LV-NEXT: bb.5.tail: ; CHECK-LV-NEXT: successors: %bb.6(0x04000000), %bb.1(0x7c000000) ; CHECK-LV-NEXT: {{ $}} - ; CHECK-LV-NEXT: [[COPY8:%[0-9]+]]:gr64 = COPY [[COPY6]] - ; CHECK-LV-NEXT: [[COPY9:%[0-9]+]]:gr64 = COPY killed [[COPY6]] - ; CHECK-LV-NEXT: [[COPY9:%[0-9]+]]:gr64 = nuw ADD64ri32 [[COPY9]], 8, implicit-def dead $eflags - ; CHECK-LV-NEXT: TEST64rr killed [[COPY4]], [[COPY4]], implicit-def $eflags - ; CHECK-LV-NEXT: [[COPY2:%[0-9]+]]:gr64 = COPY killed [[COPY9]] - ; CHECK-LV-NEXT: [[COPY3:%[0-9]+]]:gr64 = COPY killed [[COPY8]] + ; CHECK-LV-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY [[COPY4]] + ; CHECK-LV-NEXT: [[COPY6:%[0-9]+]]:gr64 = COPY killed [[COPY4]] + ; CHECK-LV-NEXT: [[COPY6:%[0-9]+]]:gr64 = nuw ADD64ri32 [[COPY6]], 8, implicit-def dead $eflags + ; CHECK-LV-NEXT: TEST64rr killed [[COPY2]], [[COPY2]], implicit-def $eflags + ; CHECK-LV-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY killed [[COPY6]] ; CHECK-LV-NEXT: JCC_1 %bb.1, 5, implicit killed $eflags ; CHECK-LV-NEXT: JMP_1 %bb.6 ; CHECK-LV-NEXT: {{ $}} @@ -149,15 +146,13 @@ define void @test2(ptr addrspace(1) %this, i32 %0, ptr addrspace(1) %p0, ptr add ; CHECK-LIS-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY $rcx ; CHECK-LIS-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY $rdx ; CHECK-LIS-NEXT: [[MOV32r0_:%[0-9]+]]:gr32 = MOV32r0 implicit-def dead $eflags - ; CHECK-LIS-NEXT: [[COPY2:%[0-9]+]]:gr64 = COPY [[COPY1]] - ; CHECK-LIS-NEXT: [[COPY3:%[0-9]+]]:gr64 = COPY [[COPY]] ; CHECK-LIS-NEXT: {{ $}} ; CHECK-LIS-NEXT: bb.1.loop.head: ; CHECK-LIS-NEXT: successors: %bb.6(0x04000000), %bb.2(0x7c000000) ; CHECK-LIS-NEXT: {{ $}} - ; CHECK-LIS-NEXT: [[COPY4:%[0-9]+]]:gr64 = COPY [[COPY3]] - ; CHECK-LIS-NEXT: [[COPY5:%[0-9]+]]:gr64 = COPY [[COPY2]] - ; CHECK-LIS-NEXT: TEST64rr [[COPY5]], [[COPY5]], implicit-def $eflags + ; CHECK-LIS-NEXT: [[COPY2:%[0-9]+]]:gr64 = COPY [[COPY]] + ; 
CHECK-LIS-NEXT: [[COPY3:%[0-9]+]]:gr64 = COPY [[COPY1]] + ; CHECK-LIS-NEXT: TEST64rr [[COPY3]], [[COPY3]], implicit-def $eflags ; CHECK-LIS-NEXT: JCC_1 %bb.6, 5, implicit killed $eflags ; CHECK-LIS-NEXT: JMP_1 %bb.2 ; CHECK-LIS-NEXT: {{ $}} @@ -165,12 +160,12 @@ define void @test2(ptr addrspace(1) %this, i32 %0, ptr addrspace(1) %p0, ptr add ; CHECK-LIS-NEXT: successors: %bb.3(0x7ffff800), %bb.7(0x00000800) ; CHECK-LIS-NEXT: {{ $}} ; CHECK-LIS-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - ; CHECK-LIS-NEXT: [[COPY4:%[0-9]+]]:gr64 = STATEPOINT 2882400000, 0, 0, undef %11:gr64, 2, 0, 2, 0, 2, 0, 2, 1, [[COPY4]](tied-def 0), 2, 0, 2, 1, 0, 0, csr_64, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax + ; CHECK-LIS-NEXT: [[COPY2:%[0-9]+]]:gr64 = STATEPOINT 2882400000, 0, 0, undef %11:gr64, 2, 0, 2, 0, 2, 0, 2, 1, [[COPY2]](tied-def 0), 2, 0, 2, 1, 0, 0, csr_64, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax ; CHECK-LIS-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp ; CHECK-LIS-NEXT: EH_LABEL ; CHECK-LIS-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - ; CHECK-LIS-NEXT: [[COPY6:%[0-9]+]]:gr64 = COPY [[COPY4]] - ; CHECK-LIS-NEXT: [[COPY6:%[0-9]+]]:gr64 = STATEPOINT 2882400000, 0, 0, undef %13:gr64, 2, 0, 2, 0, 2, 0, 2, 1, [[COPY6]](tied-def 0), 2, 0, 2, 1, 0, 0, csr_64, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax + ; CHECK-LIS-NEXT: [[COPY4:%[0-9]+]]:gr64 = COPY [[COPY2]] + ; CHECK-LIS-NEXT: [[COPY4:%[0-9]+]]:gr64 = STATEPOINT 2882400000, 0, 0, undef %13:gr64, 2, 0, 2, 0, 2, 0, 2, 1, [[COPY4]](tied-def 0), 2, 0, 2, 1, 0, 0, csr_64, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax ; CHECK-LIS-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, 
implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp ; CHECK-LIS-NEXT: EH_LABEL ; CHECK-LIS-NEXT: JMP_1 %bb.3 @@ -178,8 +173,8 @@ define void @test2(ptr addrspace(1) %this, i32 %0, ptr addrspace(1) %p0, ptr add ; CHECK-LIS-NEXT: bb.3.BB2: ; CHECK-LIS-NEXT: successors: %bb.4(0x40000000), %bb.5(0x40000000) ; CHECK-LIS-NEXT: {{ $}} - ; CHECK-LIS-NEXT: [[COPY7:%[0-9]+]]:gr8 = COPY [[MOV32r0_]].sub_8bit - ; CHECK-LIS-NEXT: TEST8rr [[COPY7]], [[COPY7]], implicit-def $eflags + ; CHECK-LIS-NEXT: [[COPY5:%[0-9]+]]:gr8 = COPY [[MOV32r0_]].sub_8bit + ; CHECK-LIS-NEXT: TEST8rr [[COPY5]], [[COPY5]], implicit-def $eflags ; CHECK-LIS-NEXT: JCC_1 %bb.5, 5, implicit killed $eflags ; CHECK-LIS-NEXT: JMP_1 %bb.4 ; CHECK-LIS-NEXT: {{ $}} @@ -189,12 +184,11 @@ define void @test2(ptr addrspace(1) %this, i32 %0, ptr addrspace(1) %p0, ptr add ; CHECK-LIS-NEXT: bb.5.tail: ; CHECK-LIS-NEXT: successors: %bb.6(0x04000000), %bb.1(0x7c000000) ; CHECK-LIS-NEXT: {{ $}} - ; CHECK-LIS-NEXT: [[COPY8:%[0-9]+]]:gr64 = COPY [[COPY6]] - ; CHECK-LIS-NEXT: [[COPY9:%[0-9]+]]:gr64 = COPY killed [[COPY6]] - ; CHECK-LIS-NEXT: [[COPY9:%[0-9]+]]:gr64 = nuw ADD64ri32 [[COPY9]], 8, implicit-def dead $eflags - ; CHECK-LIS-NEXT: TEST64rr killed [[COPY4]], [[COPY4]], implicit-def $eflags - ; CHECK-LIS-NEXT: [[COPY2:%[0-9]+]]:gr64 = COPY [[COPY9]] - ; CHECK-LIS-NEXT: [[COPY3:%[0-9]+]]:gr64 = COPY [[COPY8]] + ; CHECK-LIS-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY [[COPY4]] + ; CHECK-LIS-NEXT: [[COPY6:%[0-9]+]]:gr64 = COPY killed [[COPY4]] + ; CHECK-LIS-NEXT: [[COPY6:%[0-9]+]]:gr64 = nuw ADD64ri32 [[COPY6]], 8, implicit-def dead $eflags + ; CHECK-LIS-NEXT: TEST64rr killed [[COPY2]], [[COPY2]], implicit-def $eflags + ; CHECK-LIS-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY [[COPY6]] ; CHECK-LIS-NEXT: JCC_1 %bb.1, 5, implicit killed $eflags ; CHECK-LIS-NEXT: JMP_1 %bb.6 ; CHECK-LIS-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/X86/swifterror.ll b/llvm/test/CodeGen/X86/swifterror.ll index 
5699c447baf41..473cc9d152627 100644 --- a/llvm/test/CodeGen/X86/swifterror.ll +++ b/llvm/test/CodeGen/X86/swifterror.ll @@ -426,18 +426,19 @@ define float @foo_loop(ptr swifterror %error_ptr_ref, i32 %cc, float %cc2) { ; CHECK-O0-NEXT: jmp LBB4_1 ; CHECK-O0-NEXT: LBB4_1: ## %bb_loop ; CHECK-O0-NEXT: ## =>This Inner Loop Header: Depth=1 -; CHECK-O0-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx ## 4-byte Reload -; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload -; CHECK-O0-NEXT: cmpl $0, %ecx -; CHECK-O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; CHECK-O0-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload +; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload +; CHECK-O0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; CHECK-O0-NEXT: cmpl $0, %eax ; CHECK-O0-NEXT: je LBB4_3 ; CHECK-O0-NEXT: ## %bb.2: ## %gen_error ; CHECK-O0-NEXT: ## in Loop: Header=BB4_1 Depth=1 ; CHECK-O0-NEXT: movl $16, %edi ; CHECK-O0-NEXT: callq _malloc ; CHECK-O0-NEXT: movq %rax, %rcx -; CHECK-O0-NEXT: movb $1, 8(%rcx) -; CHECK-O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; CHECK-O0-NEXT: movq %rcx, %rax +; CHECK-O0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; CHECK-O0-NEXT: movb $1, 8(%rax) ; CHECK-O0-NEXT: LBB4_3: ## %bb_cont ; CHECK-O0-NEXT: ## in Loop: Header=BB4_1 Depth=1 ; CHECK-O0-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 4-byte Reload @@ -964,17 +965,17 @@ define void @swifterror_isel(ptr) { ; CHECK-O0-NEXT: ## implicit-def: $r12 ; CHECK-O0-NEXT: jne LBB8_2 ; CHECK-O0-NEXT: LBB8_1: ## =>This Inner Loop Header: Depth=1 -; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 ## 8-byte Reload ; CHECK-O0-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax ## 2-byte Reload +; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 ## 8-byte Reload ; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 ## 8-byte Reload ; CHECK-O0-NEXT: ## implicit-def: $edi ; CHECK-O0-NEXT: movw %ax, %di ; CHECK-O0-NEXT: 
## implicit-def: $rax ; CHECK-O0-NEXT: callq *%rax +; CHECK-O0-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-O0-NEXT: ## implicit-def: $rax ; CHECK-O0-NEXT: movw (%rax), %ax ; CHECK-O0-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; CHECK-O0-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-O0-NEXT: jmp LBB8_1 ; CHECK-O0-NEXT: LBB8_2: ; CHECK-O0-NEXT: addq $40, %rsp diff --git a/llvm/test/DebugInfo/MIR/InstrRef/phi-regallocd-to-stack.mir b/llvm/test/DebugInfo/MIR/InstrRef/phi-regallocd-to-stack.mir index ed04647c84066..31c694ee1e014 100644 --- a/llvm/test/DebugInfo/MIR/InstrRef/phi-regallocd-to-stack.mir +++ b/llvm/test/DebugInfo/MIR/InstrRef/phi-regallocd-to-stack.mir @@ -70,8 +70,7 @@ frameInfo: machineFunctionInfo: {} body: | ; CHECK-LABEL: bb.0: - ; CHECK: renamable $ebp = COPY $edi - ; CHECK: MOV32mr %stack.1, 1, $noreg, 0, $noreg, killed renamable $ebp + ; CHECK: MOV32mr %stack.[[SLOT:.+]], 1, $noreg, 0, $noreg, $edi bb.0: successors: %bb.2(0x50000000), %bb.1(0x30000000) liveins: $edi, $esi @@ -142,8 +141,8 @@ body: | %64:gr32 = PHI %24, %bb.0, %44, %bb.1, debug-location !18 DBG_INSTR_REF !14, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0), debug-location !12 - ; CHECK: DBG_PHI %stack.1, 1, 32 - ; CHECK: renamable $eax = MOV32rm %stack.1, + ; CHECK: DBG_PHI %stack.[[SLOT]], 1, 32 + ; CHECK: renamable $eax = MOV32rm %stack.[[SLOT]], ; CHECK: DBG_INSTR_REF {{.+}}, dbg-instr-ref(1, 0) $eax = COPY killed %0, debug-location !19 RET 0, killed $eax, debug-location !19 diff --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/postidx-load.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/postidx-load.ll index 5976658ccdf86..f1e443b701bbe 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/AArch64/postidx-load.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/AArch64/postidx-load.ll @@ -10,23 +10,22 @@ define i32 @i32_initially_postidx(ptr %p, i64 %n) { ; CHECK-NEXT: cmp x1, #1 ; CHECK-NEXT: b.lt .LBB0_5 ; 
CHECK-NEXT: // %bb.1: // %for.body.preheader -; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: mov x8, x0 +; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr w9, [x0], #4 -; CHECK-NEXT: add w8, w8, w9 -; CHECK-NEXT: cmp w8, #0 +; CHECK-NEXT: ldr w9, [x8], #4 +; CHECK-NEXT: add w0, w0, w9 +; CHECK-NEXT: cmp w0, #0 ; CHECK-NEXT: b.lo .LBB0_5 ; CHECK-NEXT: // %bb.3: // %for.inc ; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=1 ; CHECK-NEXT: subs x1, x1, #1 ; CHECK-NEXT: b.ne .LBB0_2 ; CHECK-NEXT: // %bb.4: // %cleanup -; CHECK-NEXT: mov w0, w8 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB0_5: -; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret entry: %cmp1 = icmp sgt i64 %n, 0 @@ -58,23 +57,22 @@ define i32 @i32_initially_offset(ptr %p, i64 %n) { ; CHECK-NEXT: cmp x1, #1 ; CHECK-NEXT: b.lt .LBB1_5 ; CHECK-NEXT: // %bb.1: // %for.body.preheader -; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: mov x8, x0 +; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: .LBB1_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr w9, [x0], #4 -; CHECK-NEXT: add w8, w8, w9 -; CHECK-NEXT: cmp w8, #0 +; CHECK-NEXT: ldr w9, [x8], #4 +; CHECK-NEXT: add w0, w0, w9 +; CHECK-NEXT: cmp w0, #0 ; CHECK-NEXT: b.lo .LBB1_5 ; CHECK-NEXT: // %bb.3: // %for.cond ; CHECK-NEXT: // in Loop: Header=BB1_2 Depth=1 ; CHECK-NEXT: subs x1, x1, #1 ; CHECK-NEXT: b.ne .LBB1_2 ; CHECK-NEXT: // %bb.4: // %cleanup -; CHECK-NEXT: mov w0, w8 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB1_5: -; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret entry: %cmp1 = icmp sgt i64 %n, 0 diff --git a/llvm/test/Transforms/LoopStrengthReduce/RISCV/lsr-drop-solution.ll b/llvm/test/Transforms/LoopStrengthReduce/RISCV/lsr-drop-solution.ll index 7353acd7228cd..1f7a5b12d16b8 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/RISCV/lsr-drop-solution.ll +++ 
b/llvm/test/Transforms/LoopStrengthReduce/RISCV/lsr-drop-solution.ll @@ -7,27 +7,23 @@ target triple = "riscv64-unknown-linux-gnu" define ptr @foo(ptr %a0, ptr %a1, i64 %a2) { ; CHECK-LABEL: foo: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: mv a3, a0 ; CHECK-NEXT: vsetvli a4, a2, e8, m8, ta, ma -; CHECK-NEXT: bne a4, a2, .LBB0_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; CHECK-NEXT: vle8.v v8, (a1) -; CHECK-NEXT: vse8.v v8, (a0) -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB0_2: # %if.then +; CHECK-NEXT: beq a4, a2, .LBB0_4 +; CHECK-NEXT: # %bb.1: # %if.then ; CHECK-NEXT: add a2, a0, a2 ; CHECK-NEXT: sub a5, a2, a4 -; CHECK-NEXT: mv a3, a0 -; CHECK-NEXT: .LBB0_3: # %do.body +; CHECK-NEXT: .LBB0_2: # %do.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle8.v v8, (a1) ; CHECK-NEXT: vse8.v v8, (a3) ; CHECK-NEXT: add a3, a3, a4 ; CHECK-NEXT: add a1, a1, a4 -; CHECK-NEXT: bltu a3, a5, .LBB0_3 -; CHECK-NEXT: # %bb.4: # %do.end +; CHECK-NEXT: bltu a3, a5, .LBB0_2 +; CHECK-NEXT: # %bb.3: # %do.end ; CHECK-NEXT: sub a2, a2, a3 ; CHECK-NEXT: vsetvli a2, a2, e8, m8, ta, ma +; CHECK-NEXT: .LBB0_4: # %if.end ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma ; CHECK-NEXT: vle8.v v8, (a1) ; CHECK-NEXT: vse8.v v8, (a3)